author    Michael Kruse <llvm-project@meinersbur.de>  2024-05-25 17:21:09 +0200
committer Michael Kruse <llvm-project@meinersbur.de>  2024-05-25 17:21:09 +0200
commit    062fdd4f4439c00437fef07488e994a6ff10bb5d (patch)
tree      79297e3188951f7b98d10f3d67a92f4df75bac80
parent    0e864bbd4142cf202aa9ffd66eb67c9528c0f452 (diff)
parent    9da81cee219da78ab44357310a3bcf481bdba26c (diff)
Merge branch 'main' into users/meinersbur/ide_folders_llvm (upstream/users/meinersbur/ide_folders_llvm)
-rwxr-xr-x.ci/generate-buildkite-pipeline-premerge2
-rw-r--r--bolt/docs/BAT.md5
-rw-r--r--bolt/include/bolt/Core/BinaryContext.h12
-rw-r--r--bolt/include/bolt/Passes/BinaryPasses.h29
-rw-r--r--bolt/include/bolt/Passes/MCF.h41
-rw-r--r--bolt/include/bolt/Passes/StokeInfo.h4
-rw-r--r--bolt/include/bolt/Profile/BoltAddressTranslation.h49
-rw-r--r--bolt/include/bolt/Profile/DataAggregator.h5
-rw-r--r--bolt/lib/Core/BinaryContext.cpp21
-rw-r--r--bolt/lib/Core/BinaryEmitter.cpp4
-rw-r--r--bolt/lib/Core/BinaryFunction.cpp26
-rw-r--r--bolt/lib/Core/DebugNames.cpp4
-rw-r--r--bolt/lib/Core/DynoStats.cpp5
-rw-r--r--bolt/lib/Passes/BinaryFunctionCallGraph.cpp4
-rw-r--r--bolt/lib/Passes/BinaryPasses.cpp51
-rw-r--r--bolt/lib/Passes/CacheMetrics.cpp43
-rw-r--r--bolt/lib/Passes/Inliner.cpp4
-rw-r--r--bolt/lib/Passes/MCF.cpp33
-rw-r--r--bolt/lib/Profile/BoltAddressTranslation.cpp50
-rw-r--r--bolt/lib/Profile/CMakeLists.txt1
-rw-r--r--bolt/lib/Profile/DataAggregator.cpp26
-rw-r--r--bolt/lib/Profile/DataReader.cpp2
-rw-r--r--bolt/lib/Profile/StaleProfileMatching.cpp12
-rw-r--r--bolt/lib/Profile/YAMLProfileReader.cpp17
-rw-r--r--bolt/lib/Profile/YAMLProfileWriter.cpp7
-rw-r--r--bolt/lib/Rewrite/BinaryPassManager.cpp19
-rw-r--r--bolt/lib/Rewrite/DWARFRewriter.cpp3
-rw-r--r--bolt/lib/Rewrite/LinuxKernelRewriter.cpp2
-rw-r--r--bolt/lib/Rewrite/RewriteInstance.cpp21
-rw-r--r--bolt/lib/Target/X86/X86MCPlusBuilder.cpp38
-rw-r--r--bolt/lib/Utils/CommandLineOpts.cpp4
-rw-r--r--bolt/runtime/instr.cpp4
-rw-r--r--bolt/test/X86/bb-with-two-tail-calls.s10
-rw-r--r--bolt/test/X86/bolt-address-translation-yaml.test7
-rw-r--r--bolt/test/X86/bolt-address-translation.test2
-rw-r--r--bolt/test/X86/dwarf5-debug-names-class-type-decl.s670
-rw-r--r--bolt/test/X86/dwarf5-debug-names-enumeration-type-decl.s485
-rw-r--r--bolt/test/X86/dwarf5-debug-names-structure-type-decl.s671
-rw-r--r--bolt/test/X86/ignored-interprocedural-reference.s49
-rw-r--r--bolt/test/X86/register-fragments-bolt-symbols.s8
-rwxr-xr-xbolt/test/link_fdata.py3
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp4
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp5
-rw-r--r--clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp32
-rw-r--r--clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp8
-rw-r--r--clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp16
-rw-r--r--clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp3
-rw-r--r--clang-tools-extra/clangd/Hover.cpp11
-rw-r--r--clang-tools-extra/clangd/test/infinite-instantiation.test2
-rw-r--r--clang-tools-extra/clangd/unittests/FindTargetTests.cpp4
-rw-r--r--clang-tools-extra/docs/ReleaseNotes.rst7
-rw-r--r--clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst4
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp10
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp30
-rw-r--r--clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c354
-rw-r--r--clang/CMakeLists.txt5
-rw-r--r--clang/cmake/caches/CrossWinToARMLinux.cmake38
-rw-r--r--clang/cmake/caches/Fuchsia-stage2.cmake1
-rw-r--r--clang/cmake/caches/Fuchsia.cmake7
-rw-r--r--clang/cmake/caches/VectorEngine.cmake4
-rw-r--r--clang/docs/ReleaseNotes.rst56
-rw-r--r--clang/docs/analyzer/checkers.rst115
-rw-r--r--clang/include/clang/AST/ASTContext.h9
-rw-r--r--clang/include/clang/AST/ASTNodeTraverser.h8
-rw-r--r--clang/include/clang/AST/Decl.h2
-rw-r--r--clang/include/clang/AST/DeclTemplate.h28
-rw-r--r--clang/include/clang/AST/RecursiveASTVisitor.h25
-rw-r--r--clang/include/clang/AST/Type.h6
-rw-r--r--clang/include/clang/Basic/Attr.td8
-rw-r--r--clang/include/clang/Basic/AttrDocs.td7
-rw-r--r--clang/include/clang/Basic/BuiltinsAArch64.def2
-rw-r--r--clang/include/clang/Basic/BuiltinsAMDGPU.def1
-rw-r--r--clang/include/clang/Basic/BuiltinsWebAssembly.def2
-rw-r--r--clang/include/clang/Basic/BuiltinsX86.def21
-rw-r--r--clang/include/clang/Basic/DiagnosticCommonKinds.td3
-rw-r--r--clang/include/clang/Basic/DiagnosticDriverKinds.td61
-rw-r--r--clang/include/clang/Basic/DiagnosticFrontendKinds.td23
-rw-r--r--clang/include/clang/Basic/DiagnosticGroups.td6
-rw-r--r--clang/include/clang/Basic/DiagnosticInstallAPIKinds.td4
-rw-r--r--clang/include/clang/Basic/DiagnosticLexKinds.td2
-rw-r--r--clang/include/clang/Basic/DiagnosticParseKinds.td6
-rw-r--r--clang/include/clang/Basic/DiagnosticSemaKinds.td93
-rw-r--r--clang/include/clang/Basic/FileManager.h2
-rw-r--r--clang/include/clang/Basic/LangOptions.def1
-rw-r--r--clang/include/clang/Basic/arm_sve.td6
-rw-r--r--clang/include/clang/Driver/Options.td21
-rw-r--r--clang/include/clang/Parse/Parser.h14
-rw-r--r--clang/include/clang/Sema/Sema.h121
-rw-r--r--clang/include/clang/Sema/SemaOpenMP.h4
-rw-r--r--clang/include/clang/Sema/SemaRISCV.h52
-rw-r--r--clang/include/clang/Sema/SemaX86.h38
-rw-r--r--clang/include/clang/StaticAnalyzer/Checkers/Checkers.td27
-rw-r--r--clang/lib/AST/ASTContext.cpp9
-rw-r--r--clang/lib/AST/ASTDiagnostic.cpp111
-rw-r--r--clang/lib/AST/ASTImporter.cpp11
-rw-r--r--clang/lib/AST/DeclPrinter.cpp7
-rw-r--r--clang/lib/AST/DeclTemplate.cpp32
-rw-r--r--clang/lib/AST/Interp/ByteCodeExprGen.cpp98
-rw-r--r--clang/lib/AST/Interp/Descriptor.cpp7
-rw-r--r--clang/lib/AST/Interp/EvaluationResult.cpp4
-rw-r--r--clang/lib/AST/Interp/Interp.cpp46
-rw-r--r--clang/lib/AST/Interp/Interp.h32
-rw-r--r--clang/lib/AST/Interp/InterpBuiltin.cpp2
-rw-r--r--clang/lib/AST/Interp/Pointer.cpp14
-rw-r--r--clang/lib/AST/Interp/Pointer.h9
-rw-r--r--clang/lib/AST/Interp/Record.cpp2
-rw-r--r--clang/lib/AST/Interp/Record.h4
-rw-r--r--clang/lib/AST/JSONNodeDumper.cpp4
-rw-r--r--clang/lib/AST/ODRDiagsEmitter.cpp19
-rw-r--r--clang/lib/AST/ODRHash.cpp4
-rw-r--r--clang/lib/AST/ParentMap.cpp16
-rw-r--r--clang/lib/AST/TemplateBase.cpp14
-rw-r--r--clang/lib/AST/Type.cpp10
-rw-r--r--clang/lib/AST/TypePrinter.cpp9
-rw-r--r--clang/lib/Analysis/CFG.cpp50
-rw-r--r--clang/lib/Basic/FileManager.cpp48
-rw-r--r--clang/lib/Basic/Targets/Mips.cpp28
-rw-r--r--clang/lib/Basic/Targets/WebAssembly.h3
-rw-r--r--clang/lib/Basic/Targets/X86.cpp21
-rw-r--r--clang/lib/Basic/Targets/X86.h3
-rw-r--r--clang/lib/CodeGen/CGBuiltin.cpp11
-rw-r--r--clang/lib/CodeGen/CGExpr.cpp20
-rw-r--r--clang/lib/CodeGen/CGStmtOpenMP.cpp29
-rw-r--r--clang/lib/CodeGen/CodeGenModule.cpp23
-rw-r--r--clang/lib/CodeGen/CoverageMappingGen.cpp67
-rw-r--r--clang/lib/CodeGen/ItaniumCXXABI.cpp91
-rw-r--r--clang/lib/Driver/Driver.cpp11
-rw-r--r--clang/lib/Driver/ToolChains/AIX.cpp8
-rw-r--r--clang/lib/Driver/ToolChains/Arch/LoongArch.cpp4
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp21
-rw-r--r--clang/lib/Driver/ToolChains/Darwin.cpp58
-rw-r--r--clang/lib/Driver/ToolChains/Darwin.h4
-rw-r--r--clang/lib/Driver/ToolChains/HIPSPV.cpp2
-rw-r--r--clang/lib/Driver/ToolChains/ZOS.cpp6
-rw-r--r--clang/lib/ExtractAPI/DeclarationFragments.cpp45
-rw-r--r--clang/lib/Format/UnwrappedLineParser.cpp21
-rw-r--r--clang/lib/Frontend/InitPreprocessor.cpp3
-rw-r--r--clang/lib/Frontend/SerializedDiagnosticPrinter.cpp2
-rw-r--r--clang/lib/Headers/CMakeLists.txt2
-rw-r--r--clang/lib/Headers/avx512erintrin.h271
-rw-r--r--clang/lib/Headers/avx512pfintrin.h92
-rw-r--r--clang/lib/Headers/immintrin.h8
-rw-r--r--clang/lib/Headers/intrin.h2
-rw-r--r--clang/lib/Headers/module.modulemap1
-rw-r--r--clang/lib/Index/IndexDecl.cpp6
-rw-r--r--clang/lib/Parse/ParseDecl.cpp146
-rw-r--r--clang/lib/Parse/ParseDeclCXX.cpp57
-rw-r--r--clang/lib/Parse/ParseObjc.cpp10
-rw-r--r--clang/lib/Parse/ParsePragma.cpp5
-rw-r--r--clang/lib/Sema/CMakeLists.txt1
-rw-r--r--clang/lib/Sema/HLSLExternalSemaSource.cpp58
-rw-r--r--clang/lib/Sema/Sema.cpp6
-rw-r--r--clang/lib/Sema/SemaAvailability.cpp5
-rw-r--r--clang/lib/Sema/SemaCast.cpp5
-rw-r--r--clang/lib/Sema/SemaChecking.cpp1922
-rw-r--r--clang/lib/Sema/SemaDecl.cpp100
-rw-r--r--clang/lib/Sema/SemaDeclAttr.cpp95
-rw-r--r--clang/lib/Sema/SemaDeclCXX.cpp13
-rw-r--r--clang/lib/Sema/SemaExpr.cpp102
-rw-r--r--clang/lib/Sema/SemaExprCXX.cpp3
-rw-r--r--clang/lib/Sema/SemaInit.cpp19
-rw-r--r--clang/lib/Sema/SemaLambda.cpp68
-rw-r--r--clang/lib/Sema/SemaLookup.cpp11
-rw-r--r--clang/lib/Sema/SemaOpenMP.cpp197
-rw-r--r--clang/lib/Sema/SemaOverload.cpp15
-rw-r--r--clang/lib/Sema/SemaRISCV.cpp939
-rw-r--r--clang/lib/Sema/SemaStmtAttr.cpp8
-rw-r--r--clang/lib/Sema/SemaTemplate.cpp222
-rw-r--r--clang/lib/Sema/SemaTemplateDeduction.cpp25
-rw-r--r--clang/lib/Sema/SemaTemplateInstantiate.cpp11
-rw-r--r--clang/lib/Sema/SemaTemplateInstantiateDecl.cpp17
-rw-r--r--clang/lib/Sema/SemaType.cpp6
-rw-r--r--clang/lib/Sema/SemaX86.cpp878
-rw-r--r--clang/lib/Sema/TreeTransform.h14
-rw-r--r--clang/lib/Serialization/ASTReaderDecl.cpp6
-rw-r--r--clang/lib/Serialization/ASTWriterDecl.cpp4
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt3
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp (renamed from clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp)26
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/SetgidSetuidOrderChecker.cpp196
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp72
-rw-r--r--clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp68
-rw-r--r--clang/lib/StaticAnalyzer/Core/ExprEngine.cpp56
-rw-r--r--clang/test/AST/Interp/arrays.cpp4
-rw-r--r--clang/test/AST/Interp/builtin-functions.cpp2
-rw-r--r--clang/test/AST/Interp/c.c12
-rw-r--r--clang/test/AST/Interp/cxx03.cpp5
-rw-r--r--clang/test/AST/Interp/cxx98.cpp4
-rw-r--r--clang/test/AST/Interp/eval-order.cpp30
-rw-r--r--clang/test/AST/Interp/objc.mm8
-rw-r--r--clang/test/AST/Interp/records.cpp6
-rw-r--r--clang/test/AST/Interp/unions.cpp67
-rw-r--r--clang/test/AST/ast-dump-decl.cpp4
-rw-r--r--clang/test/AST/ast-dump-default-init-json.cpp6
-rw-r--r--clang/test/AST/ast-dump-default-init.cpp2
-rw-r--r--clang/test/AST/ast-dump-expr-json.cpp2
-rw-r--r--clang/test/AST/ast-dump-expr.cpp2
-rw-r--r--clang/test/AST/ast-dump-stmt-json.cpp244
-rw-r--r--clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c45
-rw-r--r--clang/test/AST/attr-counted-by-struct-ptrs.c117
-rw-r--r--clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp73
-rw-r--r--clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp30
-rw-r--r--clang/test/Analysis/cert/pos34-c-fp-suppression.cpp51
-rw-r--r--clang/test/Analysis/cert/pos34-c.cpp61
-rw-r--r--clang/test/Analysis/cxx-uninitialized-object.cpp12
-rw-r--r--clang/test/Analysis/cxxnewexpr-callback.cpp4
-rw-r--r--clang/test/Analysis/lifetime-extended-regions.cpp10
-rw-r--r--clang/test/Analysis/putenv-stack-array.c70
-rw-r--r--clang/test/Analysis/setgid-setuid-order-notes.c73
-rw-r--r--clang/test/Analysis/setgid-setuid-order.c257
-rw-r--r--clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp2
-rw-r--r--clang/test/CXX/drs/cwg16xx.cpp2
-rw-r--r--clang/test/CXX/drs/cwg18xx.cpp19
-rw-r--r--clang/test/CXX/drs/cwg28xx.cpp71
-rw-r--r--clang/test/CXX/drs/cwg292.cpp17
-rw-r--r--clang/test/CXX/expr/expr.unary/expr.new/p14.cpp2
-rw-r--r--clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp2
-rw-r--r--clang/test/CXX/special/class.temporary/p6.cpp34
-rw-r--r--clang/test/ClangScanDeps/response-file-clang-cl.c56
-rw-r--r--clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c8
-rw-r--r--clang/test/CodeGen/X86/avx512er-builtins.c347
-rw-r--r--clang/test/CodeGen/X86/avx512pf-builtins.c100
-rw-r--r--clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c6
-rw-r--r--clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c57
-rw-r--r--clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c253
-rw-r--r--clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c35
-rw-r--r--clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c22
-rw-r--r--clang/test/CodeGen/assume_attr.c58
-rw-r--r--clang/test/CodeGen/attr-cpuspecific.c10
-rw-r--r--clang/test/CodeGen/attr-target-x86.c4
-rw-r--r--clang/test/CodeGen/builtins-wasm.c12
-rw-r--r--clang/test/CodeGen/darwin-target-variant.c2
-rw-r--r--clang/test/CodeGen/fat-lto-objects.c2
-rw-r--r--clang/test/CodeGen/function-target-features.c4
-rw-r--r--clang/test/CodeGen/functions.c12
-rw-r--r--clang/test/CodeGen/target-builtin-noerror.c2
-rw-r--r--clang/test/CodeGenCXX/assume_attr.cpp48
-rw-r--r--clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp10
-rw-r--r--clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp6
-rw-r--r--clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp4
-rw-r--r--clang/test/CodeGenCXX/cxx2b-deducing-this.cpp63
-rw-r--r--clang/test/CodeGenCXX/delete-two-arg.cpp4
-rw-r--r--clang/test/CodeGenCXX/delete.cpp12
-rw-r--r--clang/test/CodeGenCXX/dllimport.cpp4
-rw-r--r--clang/test/CodeGenCXX/fmv-namespace.cpp93
-rw-r--r--clang/test/CodeGenCXX/new.cpp6
-rw-r--r--clang/test/CodeGenCXX/ps-dllstorage-vtable-rtti.cpp114
-rw-r--r--clang/test/CodeGenCXX/ps4-dllstorage-vtable-rtti.cpp211
-rw-r--r--clang/test/CodeGenCXX/weak-external.cpp2
-rw-r--r--clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp2
-rw-r--r--clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp6
-rw-r--r--clang/test/CodeGenCoroutines/coro-alloc.cpp6
-rw-r--r--clang/test/CodeGenCoroutines/coro-cleanup.cpp6
-rw-r--r--clang/test/CodeGenCoroutines/coro-dealloc.cpp2
-rw-r--r--clang/test/CodeGenCoroutines/coro-gro.cpp3
-rw-r--r--clang/test/CodeGenCoroutines/pr56919.cpp9
-rw-r--r--clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl52
-rw-r--r--clang/test/CoverageMapping/builtinmacro.c2
-rw-r--r--clang/test/CoverageMapping/macros.c8
-rw-r--r--clang/test/CoverageMapping/mcdc-scratch-space.c65
-rw-r--r--clang/test/CoverageMapping/templates.cpp3
-rw-r--r--clang/test/Driver/Ofast.c7
-rw-r--r--clang/test/Driver/aarch64-v95a.c8
-rw-r--r--clang/test/Driver/android-unversioned-fallback-warning.cpp8
-rw-r--r--clang/test/Driver/cl-options.c3
-rw-r--r--clang/test/Driver/cl-x86-flags.c10
-rw-r--r--clang/test/Driver/clang_f_opts.c6
-rw-r--r--clang/test/Driver/cuda-cross-compiling.c4
-rw-r--r--clang/test/Driver/dxc_dxv_path.hlsl2
-rw-r--r--clang/test/Driver/fast-math.c24
-rw-r--r--clang/test/Driver/fat-archive-unbundle-ext.c2
-rw-r--r--clang/test/Driver/fatal-warnings.c4
-rw-r--r--clang/test/Driver/fbinutils-version.c14
-rw-r--r--clang/test/Driver/fdirect-access-external-data.c14
-rw-r--r--clang/test/Driver/fembed-bitcode.c10
-rw-r--r--clang/test/Driver/fexcess-precision.c32
-rw-r--r--clang/test/Driver/fextend-args.c2
-rw-r--r--clang/test/Driver/fforce-dwarf-frame.c6
-rw-r--r--clang/test/Driver/fgnuc-version.c18
-rw-r--r--clang/test/Driver/flags.c6
-rw-r--r--clang/test/Driver/flang/msvc-link.f902
-rw-r--r--clang/test/Driver/fmemprof.cpp10
-rw-r--r--clang/test/Driver/fopenmp.c204
-rw-r--r--clang/test/Driver/fortran.f956
-rw-r--r--clang/test/Driver/fpatchable-function-entry.c26
-rw-r--r--clang/test/Driver/frame-pointer-elim.c88
-rw-r--r--clang/test/Driver/freebsd-mips-as.c34
-rw-r--r--clang/test/Driver/freebsd.cpp10
-rw-r--r--clang/test/Driver/fsanitize-coverage.c106
-rw-r--r--clang/test/Driver/fsanitize-ignorelist.c32
-rw-r--r--clang/test/Driver/fsanitize-memory-param-retval.c16
-rw-r--r--clang/test/Driver/fsanitize-metadata-ignorelist.c8
-rw-r--r--clang/test/Driver/fsanitize-object-size.c32
-rw-r--r--clang/test/Driver/fsemantic-interposition.c22
-rw-r--r--clang/test/Driver/fsjlj-exceptions.c4
-rw-r--r--clang/test/Driver/fuse-ld-windows.c8
-rw-r--r--clang/test/Driver/fuse-ld.c28
-rw-r--r--clang/test/Driver/fuzzer.c10
-rw-r--r--clang/test/Driver/fveclib.c34
-rw-r--r--clang/test/Driver/loongarch-mlasx-error.c4
-rw-r--r--clang/test/Driver/loongarch-mlsx-error.c2
-rw-r--r--clang/test/Driver/ms-define-stdc.c11
-rw-r--r--clang/test/Driver/openmp-offload-infer.c2
-rw-r--r--clang/test/Driver/openmp-system-arch.c2
-rw-r--r--clang/test/Driver/tocdata-cc1.c17
-rw-r--r--clang/test/Driver/x-args.c4
-rw-r--r--clang/test/Driver/x86-target-features.c13
-rw-r--r--clang/test/ExtractAPI/non_type_template.cpp44
-rw-r--r--clang/test/Frontend/optimization-remark-options.c4
-rw-r--r--clang/test/Frontend/x86-target-cpu.c10
-rw-r--r--clang/test/InstallAPI/binary-attributes.test4
-rw-r--r--clang/test/Lexer/cxx-features.cpp20
-rw-r--r--clang/test/Misc/diag-template-diffing-cxx11.cpp (renamed from clang/test/Misc/diag-template-diffing.cpp)0
-rw-r--r--clang/test/Misc/diag-template-diffing-cxx26.cpp49
-rw-r--r--clang/test/Misc/pragma-attribute-supported-attributes-list.test1
-rw-r--r--clang/test/Modules/implicit-module-remap.cpp21
-rw-r--r--clang/test/OpenMP/assumes_codegen.cpp80
-rw-r--r--clang/test/OpenMP/assumes_print.cpp6
-rw-r--r--clang/test/OpenMP/assumes_template_print.cpp20
-rw-r--r--clang/test/OpenMP/atomic_messages.c96
-rw-r--r--clang/test/OpenMP/distribute_firstprivate_messages.cpp6
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp18
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp18
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp20
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp16
-rw-r--r--clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp18
-rw-r--r--clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp18
-rw-r--r--clang/test/OpenMP/distribute_simd_loop_messages.cpp30
-rw-r--r--clang/test/OpenMP/distribute_simd_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/distribute_simd_reduction_messages.cpp20
-rw-r--r--clang/test/OpenMP/reduction_implicit_map.cpp2
-rw-r--r--clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c8
-rw-r--r--clang/test/OpenMP/remarks_parallel_in_target_state_machine.c4
-rw-r--r--clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp4
-rw-r--r--clang/test/OpenMP/requires_messages.cpp26
-rw-r--r--clang/test/OpenMP/target_device_ancestor_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_firstprivate_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_map_messages.cpp20
-rw-r--r--clang/test/OpenMP/target_parallel_for_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_simd_private_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp2
-rw-r--r--clang/test/OpenMP/target_update_messages.cpp4
-rw-r--r--clang/test/OpenMP/teams_distribute_loop_messages.cpp28
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp28
-rw-r--r--clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp28
-rw-r--r--clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp28
-rw-r--r--clang/test/OpenMP/tile_codegen.cpp887
-rw-r--r--clang/test/OpenMP/tile_codegen_for_dependent.cpp130
-rw-r--r--clang/test/OpenMP/tile_codegen_tile_for.cpp218
-rw-r--r--clang/test/PCH/cxx1z-aligned-alloc.cpp10
-rw-r--r--clang/test/Parser/altivec.c24
-rw-r--r--clang/test/Parser/cxx-altivec.cpp24
-rw-r--r--clang/test/Parser/lax-conv.cpp52
-rw-r--r--clang/test/Parser/objcbridge-related-attribute.m4
-rw-r--r--clang/test/Parser/pragma-attribute.cpp2
-rw-r--r--clang/test/Preprocessor/predefined-arch-macros.c12
-rw-r--r--clang/test/Preprocessor/stdc-ms-extension.cpp9
-rw-r--r--clang/test/Preprocessor/x86_target_features.c50
-rw-r--r--clang/test/Profile/misexpect-branch.c8
-rw-r--r--clang/test/Profile/misexpect-switch-default.c2
-rw-r--r--clang/test/Profile/misexpect-switch.c2
-rw-r--r--clang/test/Sema/attr-assume.c14
-rw-r--r--clang/test/Sema/attr-counted-by-late-parsed-off.c26
-rw-r--r--clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c254
-rw-r--r--clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c17
-rw-r--r--clang/test/Sema/attr-counted-by-struct-ptrs.c224
-rw-r--r--clang/test/Sema/attr-counted-by-vla-sizeless-types.c11
-rw-r--r--clang/test/Sema/attr-counted-by-vla.c196
-rw-r--r--clang/test/Sema/attr-counted-by.c112
-rw-r--r--clang/test/Sema/attr-objc-bridge-related.m2
-rw-r--r--clang/test/Sema/builtins-x86.c8
-rw-r--r--clang/test/Sema/builtins.c8
-rw-r--r--clang/test/Sema/constant_builtins_vector.cpp4
-rw-r--r--clang/test/Sema/fmv-namespace.cpp12
-rw-r--r--clang/test/Sema/x86-eval-method.c4
-rw-r--r--clang/test/Sema/x86_64-eval-method.c2
-rw-r--r--clang/test/SemaCUDA/device-var-init.cu314
-rw-r--r--clang/test/SemaCUDA/function-overload.cu2
-rw-r--r--clang/test/SemaCUDA/union-init.cu8
-rw-r--r--clang/test/SemaCXX/MicrosoftExtensions.cpp8
-rw-r--r--clang/test/SemaCXX/addr-label-in-coroutines.cpp18
-rw-r--r--clang/test/SemaCXX/builtin-operator-new-delete.cpp2
-rw-r--r--clang/test/SemaCXX/constexpr-default-arg.cpp4
-rw-r--r--clang/test/SemaCXX/cxx11-default-member-initializers.cpp74
-rw-r--r--clang/test/SemaCXX/cxx1y-sized-deallocation.cpp2
-rw-r--r--clang/test/SemaCXX/cxx20-ctad-type-alias.cpp2
-rw-r--r--clang/test/SemaCXX/cxx23-assume.cpp15
-rw-r--r--clang/test/SemaCXX/cxx2b-consteval-propagate.cpp8
-rw-r--r--clang/test/SemaCXX/eval-crashes.cpp6
-rw-r--r--clang/test/SemaCXX/unavailable_aligned_allocation.cpp15
-rw-r--r--clang/test/SemaCXX/warn-thread-safety-analysis.cpp10
-rw-r--r--clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp4
-rw-r--r--clang/test/SemaObjC/unguarded-availability.m20
-rw-r--r--clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl14
-rw-r--r--clang/test/SemaOpenCL/vector_swizzle_length.cl4
-rw-r--r--clang/test/SemaTemplate/deduction-guide.cpp16
-rw-r--r--clang/test/SemaTemplate/make_integer_seq.cpp4
-rw-r--r--clang/tools/clang-repl/CMakeLists.txt43
-rw-r--r--clang/tools/clang-scan-deps/ClangScanDeps.cpp9
-rw-r--r--clang/tools/libclang/CIndex.cpp12
-rw-r--r--clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt6
-rw-r--r--clang/unittests/AST/ASTImporterTest.cpp4
-rw-r--r--clang/unittests/AST/DeclTest.cpp31
-rw-r--r--clang/unittests/Driver/DXCModeTest.cpp19
-rw-r--r--clang/unittests/Format/FormatTest.cpp2
-rw-r--r--clang/unittests/Format/TokenAnnotatorTest.cpp54
-rw-r--r--clang/unittests/Interpreter/CMakeLists.txt43
-rw-r--r--clang/unittests/StaticAnalyzer/CallEventTest.cpp2
-rw-r--r--clang/utils/TableGen/SveEmitter.cpp21
-rw-r--r--clang/utils/analyzer/entrypoint.py2
-rw-r--r--clang/utils/ci/buildkite-pipeline.yml31
-rwxr-xr-xclang/utils/ci/run-buildbot25
-rwxr-xr-xclang/www/cxx_dr_status.html4
-rwxr-xr-xclang/www/cxx_status.html11
-rw-r--r--compiler-rt/cmake/config-ix.cmake15
-rw-r--r--compiler-rt/lib/dfsan/dfsan_allocator.cpp2
-rw-r--r--compiler-rt/lib/dfsan/dfsan_custom.cpp26
-rw-r--r--compiler-rt/lib/lsan/lsan_allocator.cpp2
-rw-r--r--compiler-rt/lib/msan/msan_allocator.cpp2
-rwxr-xr-xcompiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh1
-rw-r--r--compiler-rt/lib/scudo/standalone/combined.h12
-rw-r--r--compiler-rt/lib/xray/tests/CMakeLists.txt5
-rw-r--r--compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp4
-rw-r--r--compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp2
-rw-r--r--compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp2
-rw-r--r--compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp4
-rw-r--r--compiler-rt/test/dfsan/custom.cpp67
-rw-r--r--flang/CMakeLists.txt2
-rw-r--r--flang/docs/Extensions.md4
-rw-r--r--flang/include/flang/Common/Fortran-features.h3
-rw-r--r--flang/include/flang/Common/api-attrs.h22
-rw-r--r--flang/include/flang/Common/visit.h7
-rw-r--r--flang/include/flang/Evaluate/characteristics.h2
-rw-r--r--flang/include/flang/Evaluate/constant.h3
-rw-r--r--flang/include/flang/Evaluate/expression.h3
-rw-r--r--flang/include/flang/Evaluate/type.h3
-rw-r--r--flang/include/flang/Optimizer/Builder/IntrinsicCall.h3
-rw-r--r--flang/include/flang/Optimizer/Builder/Runtime/Numeric.h8
-rw-r--r--flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td10
-rw-r--r--flang/include/flang/Optimizer/HLFIR/Passes.h3
-rw-r--r--flang/include/flang/Optimizer/HLFIR/Passes.td5
-rw-r--r--flang/include/flang/Semantics/scope.h2
-rw-r--r--flang/include/flang/Semantics/semantics.h4
-rw-r--r--flang/include/flang/Semantics/symbol.h1
-rw-r--r--flang/include/flang/Tools/CLOptions.inc7
-rw-r--r--flang/lib/Evaluate/characteristics.cpp17
-rw-r--r--flang/lib/Evaluate/formatting.cpp213
-rw-r--r--flang/lib/Evaluate/shape.cpp8
-rw-r--r--flang/lib/Lower/Bridge.cpp50
-rw-r--r--flang/lib/Lower/OpenMP/ClauseProcessor.cpp5
-rw-r--r--flang/lib/Lower/OpenMP/OpenMP.cpp7
-rw-r--r--flang/lib/Optimizer/Builder/IntrinsicCall.cpp31
-rw-r--r--flang/lib/Optimizer/Builder/Runtime/Numeric.cpp41
-rw-r--r--flang/lib/Optimizer/CodeGen/CodeGen.cpp40
-rw-r--r--flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp18
-rw-r--r--flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp2
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp2
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp11
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp7
-rw-r--r--flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp9
-rw-r--r--flang/lib/Optimizer/Transforms/AddDebugInfo.cpp89
-rw-r--r--flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp53
-rw-r--r--flang/lib/Optimizer/Transforms/DebugTypeGenerator.h4
-rw-r--r--flang/lib/Semantics/check-call.cpp38
-rw-r--r--flang/lib/Semantics/check-declarations.cpp87
-rw-r--r--flang/lib/Semantics/check-omp-structure.cpp59
-rw-r--r--flang/lib/Semantics/check-omp-structure.h1
-rw-r--r--flang/lib/Semantics/expression.cpp74
-rw-r--r--flang/lib/Semantics/mod-file.cpp156
-rw-r--r--flang/lib/Semantics/mod-file.h1
-rw-r--r--flang/lib/Semantics/resolve-names-utils.cpp44
-rw-r--r--flang/lib/Semantics/resolve-names.cpp47
-rw-r--r--flang/lib/Semantics/symbol.cpp12
-rw-r--r--flang/runtime/edit-output.cpp5
-rw-r--r--flang/runtime/external-unit.cpp8
-rw-r--r--flang/runtime/numeric.cpp8
-rw-r--r--flang/runtime/terminator.h2
-rw-r--r--flang/runtime/unit.cpp1
-rw-r--r--flang/test/Driver/fopenmp.f909
-rw-r--r--flang/test/Driver/mlir-pass-pipeline.f907
-rw-r--r--flang/test/Driver/w-arg-unsupported.f9052
-rw-r--r--flang/test/Driver/wextra-ok.f902
-rw-r--r--flang/test/Evaluate/triplets01.f9011
-rw-r--r--flang/test/Fir/basic-program.fir9
-rw-r--r--flang/test/Integration/debug-complex-1.f9026
-rw-r--r--flang/test/Integration/debug-fixed-array-type-2.f9043
-rw-r--r--flang/test/Integration/debug-module-2.f9039
-rw-r--r--flang/test/Lower/CUDA/cuda-data-transfer.cuf28
-rw-r--r--flang/test/Lower/Intrinsics/selected_char_kind.f9017
-rw-r--r--flang/test/Lower/Intrinsics/selected_logical_kind.f9071
-rw-r--r--flang/test/Lower/OpenMP/invalid-reduction-modifier.f904
-rw-r--r--flang/test/Semantics/OpenMP/allocate-clause01.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate-directive.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate01.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate02.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate03.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate04.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate05.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate06.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate07.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate08.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocate09.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators01.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators02.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators03.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators04.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators05.f902
-rw-r--r--flang/test/Semantics/OpenMP/allocators06.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic-hint-clause.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic01.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic02.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic03.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic04.f902
-rw-r--r--flang/test/Semantics/OpenMP/atomic05.f902
-rw-r--r--flang/test/Semantics/OpenMP/barrier.f902
-rw-r--r--flang/test/Semantics/OpenMP/clause-validity01.f902
-rw-r--r--flang/test/Semantics/OpenMP/combined-constructs.f902
-rw-r--r--flang/test/Semantics/OpenMP/common-block.f902
-rw-r--r--flang/test/Semantics/OpenMP/compiler-directive.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin01.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin02.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin03.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin04.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyin05.f902
-rw-r--r--flang/test/Semantics/OpenMP/copying.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyprivate01.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyprivate02.f902
-rw-r--r--flang/test/Semantics/OpenMP/copyprivate03.f902
-rw-r--r--flang/test/Semantics/OpenMP/critical-empty.f902
-rw-r--r--flang/test/Semantics/OpenMP/critical-hint-clause.f902
-rw-r--r--flang/test/Semantics/OpenMP/do02.f9021
-rw-r--r--flang/test/Semantics/OpenMP/reduction-modifiers.f9089
-rw-r--r--flang/test/Semantics/OpenMP/sections01.f902
-rw-r--r--flang/test/Semantics/OpenMP/sections02.f902
-rw-r--r--flang/test/Semantics/OpenMP/sections03.f9029
-rw-r--r--flang/test/Semantics/OpenMP/simd-aligned.f902
-rw-r--r--flang/test/Semantics/OpenMP/simd-nontemporal.f902
-rw-r--r--flang/test/Semantics/OpenMP/simd01.f902
-rw-r--r--flang/test/Semantics/OpenMP/simd02.f902
-rw-r--r--flang/test/Semantics/OpenMP/simd03.f904
-rw-r--r--flang/test/Semantics/OpenMP/single01.f902
-rw-r--r--flang/test/Semantics/OpenMP/single02.f902
-rw-r--r--flang/test/Semantics/OpenMP/struct.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol01.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol02.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol03.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol04.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol05.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol06.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol07.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol08.f902
-rw-r--r--flang/test/Semantics/OpenMP/symbol09.f902
-rw-r--r--flang/test/Semantics/OpenMP/sync-critical01.f902
-rw-r--r--flang/test/Semantics/OpenMP/sync-critical02.f902
-rw-r--r--flang/test/Semantics/OpenMP/taskloop01.f902
-rw-r--r--flang/test/Semantics/OpenMP/taskloop02.f902
-rw-r--r--flang/test/Semantics/OpenMP/taskloop03.f9025
-rw-r--r--flang/test/Semantics/OpenMP/taskwait.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate01.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate02.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate03.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate04.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate05.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate06.f902
-rw-r--r--flang/test/Semantics/OpenMP/threadprivate07.f902
-rw-r--r--flang/test/Semantics/OpenMP/use_device_addr.f902
-rw-r--r--flang/test/Semantics/OpenMP/use_device_addr1.f902
-rw-r--r--flang/test/Semantics/OpenMP/use_device_ptr.f902
-rw-r--r--flang/test/Semantics/OpenMP/use_device_ptr1.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare01.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare02.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare03.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare04.f902
-rw-r--r--flang/test/Semantics/OpenMP/workshare05.f902
-rw-r--r--flang/test/Semantics/bind-c12.f904
-rw-r--r--flang/test/Semantics/call05.f906
-rw-r--r--flang/test/Semantics/call39.f9023
-rw-r--r--flang/test/Semantics/modfile03.f9099
-rw-r--r--flang/test/Semantics/procinterface05.f9014
-rw-r--r--flang/test/Semantics/shape.f9010
-rw-r--r--flang/test/Transforms/debug-complex-1.fir39
-rw-r--r--flang/test/Transforms/debug-fixed-array-type.fir34
-rw-r--r--flang/test/Transforms/debug-module-1.fir40
-rw-r--r--flang/test/Transforms/debug-module-2.fir35
-rw-r--r--libc/cmake/modules/LLVMLibCObjectRules.cmake3
-rw-r--r--libc/config/baremetal/arm/entrypoints.txt4
-rw-r--r--libc/config/baremetal/riscv/entrypoints.txt4
-rw-r--r--libc/docs/ctype.rst25
-rw-r--r--libc/docs/fenv.rst117
-rw-r--r--libc/docs/signal.rst170
-rw-r--r--libc/docs/stdbit.rst166
-rw-r--r--libc/docs/threads.rst57
-rw-r--r--libc/src/__support/threads/CMakeLists.txt9
-rw-r--r--libc/src/__support/threads/CndVar.h52
-rw-r--r--libc/src/__support/threads/linux/CMakeLists.txt13
-rw-r--r--libc/src/__support/threads/linux/CndVar.cpp103
-rw-r--r--libc/src/threads/linux/CMakeLists.txt11
-rw-r--r--libc/src/threads/linux/CndVar.h148
-rw-r--r--libc/src/threads/linux/cnd_broadcast.cpp11
-rw-r--r--libc/src/threads/linux/cnd_destroy.cpp7
-rw-r--r--libc/src/threads/linux/cnd_init.cpp9
-rw-r--r--libc/src/threads/linux/cnd_signal.cpp10
-rw-r--r--libc/src/threads/linux/cnd_wait.cpp11
-rw-r--r--libc/startup/baremetal/CMakeLists.txt11
-rw-r--r--libc/startup/baremetal/fini.cpp27
-rw-r--r--libc/startup/baremetal/init.cpp32
-rw-r--r--libc/test/integration/scudo/CMakeLists.txt4
-rw-r--r--libc/utils/docgen/ctype.json28
-rwxr-xr-xlibc/utils/docgen/docgen.py189
-rw-r--r--libc/utils/docgen/fenv.json72
-rw-r--r--libc/utils/docgen/header.py87
-rw-r--r--libc/utils/docgen/signal.json145
-rw-r--r--libc/utils/docgen/stdbit.json176
-rw-r--r--libc/utils/docgen/threads.json54
-rw-r--r--libcxx/docs/ReleaseNotes/19.rst1
-rw-r--r--libcxx/docs/Status/Cxx20Issues.csv2
-rw-r--r--libcxx/docs/Status/Cxx20Papers.csv4
-rw-r--r--libcxx/docs/Status/Cxx23Issues.csv2
-rw-r--r--libcxx/docs/Status/ParallelismProjects.csv2
-rw-r--r--libcxx/include/CMakeLists.txt2
-rw-r--r--libcxx/include/__algorithm/copy_move_common.h1
-rw-r--r--libcxx/include/__algorithm/pstl_copy.h7
-rw-r--r--libcxx/include/__algorithm/pstl_count.h8
-rw-r--r--libcxx/include/__algorithm/pstl_equal.h10
-rw-r--r--libcxx/include/__algorithm/pstl_fill.h8
-rw-r--r--libcxx/include/__algorithm/pstl_find.h18
-rw-r--r--libcxx/include/__algorithm/pstl_generate.h6
-rw-r--r--libcxx/include/__algorithm/pstl_is_partitioned.h2
-rw-r--r--libcxx/include/__algorithm/pstl_merge.h27
-rw-r--r--libcxx/include/__algorithm/pstl_move.h1
-rw-r--r--libcxx/include/__algorithm/pstl_replace.h10
-rw-r--r--libcxx/include/__algorithm/pstl_sort.h16
-rw-r--r--libcxx/include/__atomic/atomic_ref.h360
-rw-r--r--libcxx/include/__atomic/atomic_sync.h1
-rw-r--r--libcxx/include/__atomic/check_memory_order.h4
-rw-r--r--libcxx/include/__atomic/cxx_atomic_impl.h27
-rw-r--r--libcxx/include/__atomic/to_gcc_order.h54
-rw-r--r--libcxx/include/__exception/exception_ptr.h17
-rw-r--r--libcxx/include/__locale4
-rw-r--r--libcxx/include/atomic1
-rw-r--r--libcxx/include/experimental/__simd/scalar.h7
-rw-r--r--libcxx/include/experimental/__simd/simd.h11
-rw-r--r--libcxx/include/experimental/__simd/simd_mask.h11
-rw-r--r--libcxx/include/experimental/__simd/vec_ext.h11
-rw-r--r--libcxx/include/forward_list1
-rw-r--r--libcxx/include/libcxx.imp869
-rw-r--r--libcxx/include/list1
-rw-r--r--libcxx/include/locale53
-rw-r--r--libcxx/include/module.modulemap10
-rw-r--r--libcxx/include/vector64
-rw-r--r--libcxx/modules/std/atomic.inc2
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_strong.pass.cpp58
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_weak.pass.cpp58
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.ctor.pass.cpp40
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.load.pass.cpp55
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.store.pass.cpp63
-rw-r--r--libcxx/test/libcxx/atomics/atomics.ref/assert.wait.pass.cpp55
-rw-r--r--libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp3
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.exception_handling.pass.cpp58
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.exception_handling.pass.cpp40
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.replace/pstl.exception_handling.pass.cpp118
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/pstl.exception_handling.pass.cpp43
-rw-r--r--libcxx/test/std/algorithms/alg.modifying.operations/alg.transform/pstl.exception_handling.pass.cpp73
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.exception_handling.pass.cpp44
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.exception_handling.pass.cpp44
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/pstl.exception_handling.pass.cpp53
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.find/pstl.exception_handling.pass.cpp87
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.exception_handling.pass.cpp53
-rw-r--r--libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.exception_handling.pass.cpp44
-rw-r--r--libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.exception_handling.pass.cpp51
-rw-r--r--libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.exception_handling.pass.cpp41
-rw-r--r--libcxx/test/std/algorithms/numeric.ops/reduce/pstl.exception_handling.pass.cpp52
-rw-r--r--libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.exception_handling.pass.cpp62
-rw-r--r--libcxx/test/std/algorithms/pstl.exception_handling.pass.cpp339
-rw-r--r--libcxx/test/std/atomics/atomics.ref/assign.pass.cpp50
-rw-r--r--libcxx/test/std/atomics/atomics.ref/bitwise_and_assign.pass.cpp60
-rw-r--r--libcxx/test/std/atomics/atomics.ref/bitwise_or_assign.pass.cpp56
-rw-r--r--libcxx/test/std/atomics/atomics.ref/bitwise_xor_assign.pass.cpp56
-rw-r--r--libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp221
-rw-r--r--libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp226
-rw-r--r--libcxx/test/std/atomics/atomics.ref/convert.pass.cpp45
-rw-r--r--libcxx/test/std/atomics/atomics.ref/ctor.pass.cpp37
-rw-r--r--libcxx/test/std/atomics/atomics.ref/deduction.pass.cpp33
-rw-r--r--libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp45
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_add.pass.cpp113
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_and.pass.cpp69
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_or.pass.cpp68
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_sub.pass.cpp113
-rw-r--r--libcxx/test/std/atomics/atomics.ref/fetch_xor.pass.cpp68
-rw-r--r--libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp97
-rw-r--r--libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp71
-rw-r--r--libcxx/test/std/atomics/atomics.ref/load.pass.cpp62
-rw-r--r--libcxx/test/std/atomics/atomics.ref/member_types.pass.cpp132
-rw-r--r--libcxx/test/std/atomics/atomics.ref/notify_all.pass.cpp78
-rw-r--r--libcxx/test/std/atomics/atomics.ref/notify_one.pass.cpp46
-rw-r--r--libcxx/test/std/atomics/atomics.ref/operator_minus_equals.pass.cpp79
-rw-r--r--libcxx/test/std/atomics/atomics.ref/operator_plus_equals.pass.cpp79
-rw-r--r--libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp39
-rw-r--r--libcxx/test/std/atomics/atomics.ref/requires-trivially-copyable.verify.cpp26
-rw-r--r--libcxx/test/std/atomics/atomics.ref/store.pass.cpp61
-rw-r--r--libcxx/test/std/atomics/atomics.ref/test_helper.h136
-rw-r--r--libcxx/test/std/atomics/atomics.ref/wait.pass.cpp88
-rw-r--r--libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp3
-rw-r--r--libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp2
-rw-r--r--libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp2
-rw-r--r--libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp173
-rw-r--r--libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_copy.pass.cpp127
-rw-r--r--libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp8
-rw-r--r--libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp8
-rw-r--r--libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp161
-rw-r--r--libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp161
-rw-r--r--libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp161
-rw-r--r--libcxx/test/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp (renamed from libcxx/test/std/algorithms/numeric.ops/reduce/pstl.reduce.pass.cpp)2
-rw-r--r--libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp (renamed from libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp)2
-rw-r--r--libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp (renamed from libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp)2
-rw-r--r--libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp2
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp146
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp200
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp128
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp175
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp213
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp161
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp212
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp133
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp179
-rw-r--r--libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp241
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp132
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp140
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp135
-rw-r--r--libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp2
-rw-r--r--libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp2
-rw-r--r--libcxx/utils/libcxx/test/features.py35
-rw-r--r--libcxxabi/include/cxxabi.h8
-rw-r--r--libcxxabi/src/cxa_exception.cpp7
-rw-r--r--libcxxabi/src/cxa_exception.h2
-rw-r--r--libcxxabi/src/cxa_personality.cpp36
-rw-r--r--libunwind/include/__libunwind_config.h4
-rw-r--r--libunwind/src/Unwind-wasm.c4
-rw-r--r--libunwind/src/UnwindCursor.hpp2
-rw-r--r--libunwind/src/UnwindLevel1.c3
-rw-r--r--libunwind/src/UnwindRegistersRestore.S4
-rw-r--r--libunwind/src/UnwindRegistersSave.S4
-rw-r--r--libunwind/src/libunwind.cpp5
-rw-r--r--lld/ELF/Arch/AVR.cpp3
-rw-r--r--lld/ELF/Config.h5
-rw-r--r--lld/ELF/Driver.cpp36
-rw-r--r--lld/ELF/Options.td1
-rw-r--r--lld/ELF/OutputSections.cpp19
-rw-r--r--lld/docs/ld.lld.16
-rw-r--r--lld/test/ELF/aarch64-feature-gcs.s134
-rw-r--r--lld/test/ELF/avr-reloc-error.s5
-rw-r--r--lld/test/ELF/avr-reloc.s12
-rw-r--r--lld/test/ELF/compress-debug-sections-zstd.s29
-rw-r--r--lld/test/ELF/compress-sections-special.s4
-rw-r--r--lld/test/ELF/compress-sections.s24
-rw-r--r--lld/test/ELF/compressed-debug-level.test6
-rw-r--r--lld/test/ELF/linkerscript/compress-debug-sections.s2
-rw-r--r--lld/test/ELF/linkerscript/compress-sections.s8
-rw-r--r--lld/test/wasm/shared64.s14
-rw-r--r--lld/wasm/Driver.cpp10
-rw-r--r--lld/wasm/Symbols.cpp2
-rw-r--r--lld/wasm/Symbols.h5
-rw-r--r--lld/wasm/SyntheticSections.cpp8
-rw-r--r--lld/wasm/Writer.cpp10
-rw-r--r--lldb/cmake/modules/LLDBConfig.cmake20
-rw-r--r--lldb/docs/resources/build.rst1
-rw-r--r--lldb/include/lldb/Symbol/CompilerType.h2
-rw-r--r--lldb/include/lldb/Symbol/TypeSystem.h2
-rw-r--r--lldb/include/lldb/Target/Process.h4
-rw-r--r--lldb/packages/Python/lldbsuite/test/dotest.py25
-rw-r--r--lldb/source/Breakpoint/BreakpointResolverFileLine.cpp10
-rw-r--r--lldb/source/Commands/CommandObjectThread.cpp4
-rw-r--r--lldb/source/Core/CMakeLists.txt3
-rw-r--r--lldb/source/Core/ValueObject.cpp31
-rw-r--r--lldb/source/Core/ValueObjectConstResultImpl.cpp12
-rw-r--r--lldb/source/Host/common/Socket.cpp3
-rw-r--r--lldb/source/Interpreter/CommandInterpreter.cpp10
-rw-r--r--lldb/source/Interpreter/Options.cpp29
-rw-r--r--lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp13
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp18
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp14
-rw-r--r--lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp4
-rw-r--r--lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp71
-rw-r--r--lldb/source/Plugins/Process/elf-core/ProcessElfCore.h10
-rw-r--r--lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp9
-rw-r--r--lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h5
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp40
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h2
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp6
-rw-r--r--lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp23
-rw-r--r--lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h2
-rw-r--r--lldb/source/Symbol/CompilerType.cpp5
-rw-r--r--lldb/source/Symbol/Symbol.cpp18
-rw-r--r--lldb/source/Symbol/SymbolFileOnDemand.cpp5
-rw-r--r--lldb/source/Symbol/TypeSystem.cpp32
-rw-r--r--lldb/source/Target/Target.cpp36
-rw-r--r--lldb/source/Utility/Status.cpp3
-rw-r--r--lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py18
-rw-r--r--lldb/test/API/functionalities/thread/exit_during_expression/main.c2
-rw-r--r--lldb/test/API/lang/c/enum_types/TestEnumTypes.py6
-rw-r--r--lldb/test/API/python_api/debugger/TestDebuggerAPI.py5
-rw-r--r--lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py4
-rw-r--r--lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py4
-rw-r--r--lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/console/TestDAP_console.py5
-rw-r--r--lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py3
-rw-r--r--lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py5
-rw-r--r--lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py14
-rw-r--r--lldb/test/API/tools/lldb-dap/module/TestDAP_module.py3
-rw-r--r--lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py3
-rw-r--r--lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py7
-rw-r--r--lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/step/TestDAP_step.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py1
-rw-r--r--lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py2
-rw-r--r--lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py8
-rw-r--r--lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s47
-rw-r--r--lldb/tools/lldb-dap/DAP.h2
-rw-r--r--lldb/tools/lldb-dap/lldb-dap.cpp13
-rw-r--r--llvm/CMakeLists.txt2
-rw-r--r--llvm/cmake/config-ix.cmake33
-rw-r--r--llvm/cmake/modules/AddLLVM.cmake14
-rw-r--r--llvm/cmake/modules/FindTerminfo.cmake55
-rw-r--r--llvm/cmake/modules/HandleLLVMOptions.cmake147
-rw-r--r--llvm/cmake/modules/LLVMConfig.cmake.in5
-rw-r--r--llvm/docs/AMDGPUUsage.rst2
-rw-r--r--llvm/docs/GettingInvolved.rst5
-rw-r--r--llvm/docs/LangRef.rst4
-rw-r--r--llvm/docs/ReleaseNotes.rst13
-rw-r--r--llvm/docs/SPIRVUsage.rst8
-rw-r--r--llvm/include/llvm/Analysis/CFG.h12
-rw-r--r--llvm/include/llvm/Analysis/TargetTransformInfo.h2
-rw-r--r--llvm/include/llvm/Analysis/VecFuncs.def16
-rw-r--r--llvm/include/llvm/BinaryFormat/ELF.h4
-rw-r--r--llvm/include/llvm/Bitcode/BitcodeWriter.h9
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h3
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h11
-rw-r--r--llvm/include/llvm/CodeGen/MachineInstr.h6
-rw-r--r--llvm/include/llvm/CodeGen/ValueTypes.h6
-rw-r--r--llvm/include/llvm/CodeGen/ValueTypes.td5
-rw-r--r--llvm/include/llvm/CodeGenTypes/MachineValueType.h8
-rw-r--r--llvm/include/llvm/Config/config.h.cmake3
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/ClauseT.h2
-rw-r--r--llvm/include/llvm/IR/ConstantRange.h9
-rw-r--r--llvm/include/llvm/IR/IRBuilder.h7
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAMDGPU.td36
-rw-r--r--llvm/include/llvm/IR/IntrinsicsSPIRV.td1
-rw-r--r--llvm/include/llvm/IR/IntrinsicsWebAssembly.td8
-rw-r--r--llvm/include/llvm/IR/IntrinsicsX86.td84
-rw-r--r--llvm/include/llvm/IR/VPIntrinsics.def4
-rw-r--r--llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h5
-rw-r--r--llvm/include/llvm/MCA/InstrBuilder.h3
-rw-r--r--llvm/include/llvm/Object/ObjectFile.h1
-rw-r--r--llvm/include/llvm/Option/ArgList.h8
-rw-r--r--llvm/include/llvm/ProfileData/InstrProf.h30
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfReader.h2
-rw-r--r--llvm/include/llvm/ProfileData/InstrProfWriter.h3
-rw-r--r--llvm/include/llvm/Support/Error.h24
-rw-r--r--llvm/include/llvm/Target/GlobalISel/Combine.td85
-rw-r--r--llvm/include/llvm/TargetParser/X86TargetParser.def33
-rw-r--r--llvm/include/llvm/Transforms/IPO/FunctionImport.h6
-rw-r--r--llvm/lib/Analysis/CFG.cpp74
-rw-r--r--llvm/lib/Analysis/LoopAccessAnalysis.cpp99
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp21
-rw-r--r--llvm/lib/Analysis/TargetLibraryInfo.cpp10
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp2
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp38
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp73
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h10
-rw-r--r--llvm/lib/CodeGen/AtomicExpandPass.cpp34
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp64
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp7
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp6
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Utils.cpp24
-rw-r--r--llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp12
-rw-r--r--llvm/lib/CodeGen/LiveRangeEdit.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineScheduler.cpp21
-rw-r--r--llvm/lib/CodeGen/RegisterPressure.cpp6
-rw-r--r--llvm/lib/CodeGen/ScheduleDAG.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectOptimize.cpp82
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp28
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp7
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp18
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp8
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp7
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp120
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp2
-rw-r--r--llvm/lib/CodeGen/ValueTypes.cpp17
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp4
-rw-r--r--llvm/lib/IR/ConstantRange.cpp22
-rw-r--r--llvm/lib/IR/IRBuilder.cpp4
-rw-r--r--llvm/lib/IR/MDBuilder.cpp14
-rw-r--r--llvm/lib/IR/Module.cpp2
-rw-r--r--llvm/lib/LTO/LTO.cpp8
-rw-r--r--llvm/lib/LTO/ThinLTOCodeGenerator.cpp10
-rw-r--r--llvm/lib/MCA/InstrBuilder.cpp20
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp98
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp150
-rw-r--r--llvm/lib/ProfileData/InstrProfWriter.cpp71
-rw-r--r--llvm/lib/ProfileData/MemProf.cpp6
-rw-r--r--llvm/lib/ProfileData/MemProfReader.cpp18
-rw-r--r--llvm/lib/Support/CMakeLists.txt11
-rw-r--r--llvm/lib/Support/Error.cpp5
-rw-r--r--llvm/lib/Support/Unix/Process.inc60
-rw-r--r--llvm/lib/Support/raw_socket_stream.cpp23
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.cpp29
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td3
-rw-r--r--llvm/lib/Target/AArch64/AArch64Features.td46
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp29
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64PointerAuth.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64PointerAuth.h12
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp27
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h21
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp27
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp3
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp49
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp32
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp17
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp65
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp51
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h3
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h9
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp40
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp51
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp45
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h35
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp540
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h79
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/Mips/Mips32r6InstrInfo.td14
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp14
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp22
-rw-r--r--llvm/lib/Target/PowerPC/PPCFastISel.cpp14
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp29
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td2
-rw-r--r--llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp4
-rw-r--r--llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp15
-rw-r--r--llvm/lib/Target/RISCV/RISCVFeatures.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp14
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoV.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td13
-rw-r--r--llvm/lib/Target/RISCV/RISCVProcessors.td15
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFive7.td4
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td1
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td5
-rw-r--r--llvm/lib/Target/RISCV/RISCVScheduleV.td6
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp8
-rw-r--r--llvm/lib/Target/SPIRV/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp13
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp35
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVBuiltins.td9
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp4
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp23
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp22
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVISelLowering.h9
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.cpp46
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.h33
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp13
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.h1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVInstrInfo.td15
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp16
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp3
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp150
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td2
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td8
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp1
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSubtarget.h6
-rw-r--r--llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td5
-rw-r--r--llvm/lib/Target/VE/VVPNodes.def4
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h2
-rw-r--r--llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp12
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp17
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td24
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td5
-rw-r--r--llvm/lib/Target/X86/X86.td12
-rw-r--r--llvm/lib/Target/X86/X86FixupBWInsts.cpp7
-rw-r--r--llvm/lib/Target/X86/X86FlagsCopyLowering.cpp500
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp145
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h12
-rw-r--r--llvm/lib/Target/X86/X86Instr3DNow.td3
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td91
-rw-r--r--llvm/lib/Target/X86/X86InstrFragments.td8
-rw-r--r--llvm/lib/Target/X86/X86InstrFragmentsSIMD.td11
-rw-r--r--llvm/lib/Target/X86/X86InstrPredicates.td3
-rw-r--r--llvm/lib/Target/X86/X86IntrinsicsInfo.h27
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h8
-rw-r--r--llvm/lib/TargetParser/Host.cpp9
-rw-r--r--llvm/lib/TargetParser/RISCVISAInfo.cpp8
-rw-r--r--llvm/lib/TargetParser/X86TargetParser.cpp13
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroElide.cpp6
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroFrame.cpp82
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/Attributor.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/AttributorAttributes.cpp10
-rw-r--r--llvm/lib/Transforms/IPO/FunctionImport.cpp5
-rw-r--r--llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp16
-rw-r--r--llvm/lib/Transforms/IPO/OpenMPOpt.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp11
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp33
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp8
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp6
-rw-r--r--llvm/lib/Transforms/Utils/SCCPSolver.cpp21
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp40
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp53
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp239
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp59
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h17
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp11
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h3
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/cast.ll2
-rw-r--r--llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll32
-rw-r--r--llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll1621
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll56
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll37
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll30
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll55
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll346
-rw-r--r--llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll121
-rw-r--r--llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll50
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir252
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir42
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-vhadd.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/bitfield-insert.ll11
-rw-r--r--llvm/test/CodeGen/AArch64/hadd-combine.ll67
-rw-r--r--llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll146
-rw-r--r--llvm/test/CodeGen/AArch64/neon-dotreduce.ll536
-rw-r--r--llvm/test/CodeGen/AArch64/pr58431.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/selectopt-not.ll326
-rw-r--r--llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll197
-rw-r--r--llvm/test/CodeGen/AArch64/sve-calling-convention.ll124
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-pr92779.ll36
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/trunc-to-tbl.ll118
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir89
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll43
-rw-r--r--llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll57
-rw-r--r--llvm/test/CodeGen/AMDGPU/dpp_combine.ll15
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmaximum3.ll3349
-rw-r--r--llvm/test/CodeGen/AMDGPU/fminimum3.ll3349
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_sint.ll395
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp_to_uint.ll395
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll34
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp.ll1592
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.exp10.ll1592
-rw-r--r--llvm/test/CodeGen/AMDGPU/sad.ll369
-rw-r--r--llvm/test/CodeGen/AMDGPU/shl.ll216
-rw-r--r--llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/trap-abis.ll150
-rw-r--r--llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll32
-rw-r--r--llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll73
-rw-r--r--llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll15
-rw-r--r--llvm/test/CodeGen/PowerPC/ctrloop-le.ll15
-rw-r--r--llvm/test/CodeGen/PowerPC/toc-data-no-data-sections.ll18
-rw-r--r--llvm/test/CodeGen/PowerPC/toc-data.ll75
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll60
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll36
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir32
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir33
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir90
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir90
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv32.mir130
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv64.mir130
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir26
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir404
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir358
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/libcalls.ll51
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/shift.ll48
-rw-r--r--llvm/test/CodeGen/RISCV/attributes.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll14
-rw-r--r--llvm/test/CodeGen/RISCV/mul.ll149
-rw-r--r--llvm/test/CodeGen/RISCV/pr90730.ll22
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rv64zba.ll12
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll (renamed from llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll)0
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll (renamed from llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll)0
-rw-r--r--llvm/test/CodeGen/RISCV/sextw-removal.ll19
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll93
-rw-r--r--llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll59
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll124
-rw-r--r--llvm/test/CodeGen/WebAssembly/fast-isel-call-indirect64.ll14
-rw-r--r--llvm/test/CodeGen/WebAssembly/function-pointer64.ll5
-rw-r--r--llvm/test/CodeGen/WebAssembly/half-precision.ll20
-rw-r--r--llvm/test/CodeGen/X86/abds-vector-128.ll350
-rw-r--r--llvm/test/CodeGen/X86/abds-vector-256.ll72
-rw-r--r--llvm/test/CodeGen/X86/abdu-vector-128.ll280
-rw-r--r--llvm/test/CodeGen/X86/abdu-vector-256.ll72
-rw-r--r--llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll2
-rw-r--r--llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll24
-rw-r--r--llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll24
-rw-r--r--llvm/test/CodeGen/X86/avx512er-intrinsics.ll306
-rw-r--r--llvm/test/CodeGen/X86/coalescer-add-implicit-def-subreg-to-reg-regression.ll45
-rw-r--r--llvm/test/CodeGen/X86/combine-srem.ll4
-rw-r--r--llvm/test/CodeGen/X86/crc32-target-feature.ll4
-rw-r--r--llvm/test/CodeGen/X86/fat-lto-section.ll2
-rw-r--r--llvm/test/CodeGen/X86/freeze-binary.ll6
-rw-r--r--llvm/test/CodeGen/X86/funnel-shift.ll821
-rw-r--r--llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll7
-rw-r--r--llvm/test/CodeGen/X86/issue76416.ll78
-rw-r--r--llvm/test/CodeGen/X86/midpoint-int-vec-128.ll669
-rw-r--r--llvm/test/CodeGen/X86/midpoint-int-vec-256.ll154
-rw-r--r--llvm/test/CodeGen/X86/misched-critical-path.ll35
-rw-r--r--llvm/test/CodeGen/X86/opt-pipeline.ll2
-rw-r--r--llvm/test/CodeGen/X86/pmul.ll11
-rw-r--r--llvm/test/CodeGen/X86/pr59305.ll69
-rw-r--r--llvm/test/CodeGen/X86/pr90703.ll21
-rw-r--r--llvm/test/CodeGen/X86/pr90844.ll17
-rw-r--r--llvm/test/CodeGen/X86/pr92720.ll15
-rw-r--r--llvm/test/CodeGen/X86/pr93000.ll44
-rw-r--r--llvm/test/CodeGen/X86/prefetch.ll17
-rw-r--r--llvm/test/CodeGen/X86/shrink_vmul.ll4
-rw-r--r--llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll22
-rw-r--r--llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll1
-rw-r--r--llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll2
-rw-r--r--llvm/test/CodeGen/X86/unfoldMemoryOperand.mir2
-rw-r--r--llvm/test/DebugInfo/X86/debug-names-types.ll24
-rw-r--r--llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll15
-rw-r--r--llvm/test/Linker/darwin-target-variant.ll42
-rw-r--r--llvm/test/MC/AArch64/FP8/system-regs.s22
-rw-r--r--llvm/test/MC/AArch64/SVE/condition-codes.s (renamed from llvm/test/MC/AArch64/SVE/condtion-codes.s)0
-rw-r--r--llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s4
-rw-r--r--llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s32
-rw-r--r--llvm/test/MC/AMDGPU/amd_kernel_code_t.s171
-rw-r--r--llvm/test/MC/MachO/darwin-target-variant-reverse.ll2
-rw-r--r--llvm/test/MC/MachO/darwin-target-variant.ll2
-rw-r--r--llvm/test/MC/RISCV/attribute-arch.s2
-rw-r--r--llvm/test/MC/WebAssembly/simd-encodings.s6
-rw-r--r--llvm/test/ThinLTO/X86/import_callee_declaration.ll74
-rw-r--r--llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll41
-rw-r--r--llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll112
-rw-r--r--llvm/test/Transforms/Attributor/issue87856.ll61
-rw-r--r--llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll6
-rw-r--r--llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll6
-rw-r--r--llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll59
-rw-r--r--llvm/test/Transforms/Coroutines/coro-debug-frame-variable-inlined.ll (renamed from llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll)4
-rw-r--r--llvm/test/Transforms/Coroutines/coro-lifetime-end.ll142
-rw-r--r--llvm/test/Transforms/Coroutines/no-suspend.ll2
-rw-r--r--llvm/test/Transforms/CorrelatedValuePropagation/mul.ll6
-rw-r--r--llvm/test/Transforms/EntryExitInstrumenter/mcount-aix.ll12
-rw-r--r--llvm/test/Transforms/EntryExitInstrumenter/mcount.ll157
-rw-r--r--llvm/test/Transforms/FunctionAttrs/nocapture.ll8
-rw-r--r--llvm/test/Transforms/FunctionAttrs/nonnull.ll166
-rw-r--r--llvm/test/Transforms/FunctionAttrs/norecurse.ll31
-rw-r--r--llvm/test/Transforms/FunctionAttrs/read-write-scc.ll4
-rw-r--r--llvm/test/Transforms/FunctionAttrs/willreturn.ll10
-rw-r--r--llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll26
-rw-r--r--llvm/test/Transforms/InstCombine/load-cmp.ll17
-rw-r--r--llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll4
-rw-r--r--llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll271
-rw-r--r--llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll36
-rw-r--r--llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll8
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll70
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll118
-rw-r--r--llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll66
-rw-r--r--llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll189
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr23997.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/pr54634.ll2
-rw-r--r--llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll2
-rw-r--r--llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll35
-rw-r--r--llvm/test/Transforms/OpenMP/custom_state_machines.ll2
-rw-r--r--llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll2
-rw-r--r--llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll10
-rw-r--r--llvm/test/Transforms/OpenMP/spmdization.ll4
-rw-r--r--llvm/test/Transforms/OpenMP/spmdization_guarding.ll4
-rw-r--r--llvm/test/Transforms/OpenMP/spmdization_remarks.ll14
-rw-r--r--llvm/test/Transforms/SCCP/ip-add-range-to-call.ll2
-rw-r--r--llvm/test/Transforms/SCCP/range-mul-nuw-nsw-flags.ll26
-rw-r--r--llvm/test/Transforms/SCCP/range-with-undef.ll118
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll64
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll64
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll13
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll46
-rw-r--r--llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll130
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll144
-rw-r--r--llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll30
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll28
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll28
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll7
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll74
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/hadd.ll74
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll4
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll2
-rw-r--r--llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll2
-rw-r--r--llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll61
-rw-r--r--llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll89
-rw-r--r--llvm/test/Transforms/Util/add-TLI-mappings.ll8
-rw-r--r--llvm/test/tools/llvm-driver/symlink-call.test2
-rw-r--r--llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s791
-rw-r--r--llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s812
-rw-r--r--llvm/test/tools/llvm-mca/X86/call-latency.s58
-rw-r--r--llvm/test/tools/llvm-objcopy/tool-options.test6
-rw-r--r--llvm/test/tools/llvm-profdata/show-order-error.proftext27
-rw-r--r--llvm/test/tools/llvm-profdata/show-order.proftext11
-rw-r--r--llvm/test/tools/llvm-profgen/profile-density.test16
-rw-r--r--llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test53
-rw-r--r--llvm/tools/llvm-cxxfilt/CMakeLists.txt4
-rw-r--r--llvm/tools/llvm-lto/llvm-lto.cpp5
-rw-r--r--llvm/tools/llvm-mca/llvm-mca.cpp7
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOptions.cpp6
-rw-r--r--llvm/tools/llvm-profdata/llvm-profdata.cpp43
-rw-r--r--llvm/tools/llvm-profgen/PerfReader.cpp2
-rw-r--r--llvm/tools/llvm-profgen/ProfileGenerator.cpp147
-rw-r--r--llvm/tools/llvm-profgen/ProfileGenerator.h9
-rw-r--r--llvm/tools/llvm-readobj/ELFDumper.cpp3
-rw-r--r--llvm/unittests/IR/ConstantRangeTest.cpp102
-rw-r--r--llvm/unittests/IR/MDBuilderTest.cpp39
-rw-r--r--llvm/unittests/ProfileData/BPFunctionNodeTest.cpp33
-rw-r--r--llvm/unittests/Support/LEB128Test.cpp20
-rw-r--r--llvm/unittests/Support/raw_socket_stream_test.cpp19
-rw-r--r--llvm/unittests/TargetParser/TargetParserTest.cpp6
-rw-r--r--llvm/unittests/tools/llvm-mca/MCATestBase.cpp2
-rw-r--r--llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp4
-rw-r--r--llvm/utils/TableGen/Common/CMakeLists.txt1
-rw-r--r--llvm/utils/TableGen/Common/CodeGenTarget.cpp209
-rw-r--r--llvm/utils/UpdateTestChecks/common.py2
-rw-r--r--llvm/utils/gn/README.rst2
-rw-r--r--llvm/utils/gn/build/libs/terminfo/BUILD.gn12
-rw-r--r--llvm/utils/gn/build/libs/terminfo/enable.gni4
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn3
-rw-r--r--llvm/utils/gn/secondary/libcxx/include/BUILD.gn2
-rw-r--r--llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn7
-rw-r--r--llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn6
-rw-r--r--mlir/CMakeLists.txt8
-rw-r--r--mlir/docs/PassManagement.md39
-rw-r--r--mlir/include/mlir-c/Debug.h13
-rw-r--r--mlir/include/mlir/Analysis/SliceAnalysis.h5
-rw-r--r--mlir/include/mlir/Analysis/TopologicalSortUtils.h (renamed from mlir/include/mlir/Transforms/TopologicalSortUtils.h)14
-rw-r--r--mlir/include/mlir/Config/mlir-config.h.cmake4
-rw-r--r--mlir/include/mlir/Dialect/Arith/IR/ArithOps.td32
-rw-r--r--mlir/include/mlir/Dialect/CommonFolders.h5
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h1
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td18
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h73
-rw-r--r--mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td4
-rw-r--r--mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h4
-rw-r--r--mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td2
-rw-r--r--mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h28
-rw-r--r--mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h9
-rw-r--r--mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td25
-rw-r--r--mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td46
-rw-r--r--mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td38
-rw-r--r--mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td70
-rw-r--r--mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h4
-rw-r--r--mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h4
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/VectorOps.td80
-rw-r--r--mlir/include/mlir/IR/OpBase.td12
-rw-r--r--mlir/include/mlir/InitAllPasses.h3
-rw-r--r--mlir/include/mlir/Interfaces/TilingInterface.td4
-rw-r--r--mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h25
-rw-r--r--mlir/include/mlir/Pass/PassManager.h39
-rw-r--r--mlir/include/mlir/Transforms/RegionUtils.h4
-rw-r--r--mlir/lib/Analysis/CMakeLists.txt2
-rw-r--r--mlir/lib/Analysis/Liveness.cpp4
-rw-r--r--mlir/lib/Analysis/SliceAnalysis.cpp59
-rw-r--r--mlir/lib/Analysis/TopologicalSortUtils.cpp (renamed from mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp)141
-rw-r--r--mlir/lib/Bindings/Python/IRAttributes.cpp77
-rw-r--r--mlir/lib/Bindings/Python/IRCore.cpp15
-rw-r--r--mlir/lib/CAPI/Debug/Debug.cpp18
-rw-r--r--mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp92
-rw-r--r--mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp64
-rw-r--r--mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp22
-rw-r--r--mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp1
-rw-r--r--mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp1
-rw-r--r--mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp22
-rw-r--r--mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp1
-rw-r--r--mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp5
-rw-r--r--mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp3
-rw-r--r--mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp22
-rw-r--r--mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp575
-rw-r--r--mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp6
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp3
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp26
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp17
-rw-r--r--mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp243
-rw-r--r--mlir/lib/Dialect/Mesh/IR/MeshOps.cpp94
-rw-r--r--mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp119
-rw-r--r--mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp231
-rw-r--r--mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp3
-rw-r--r--mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp17
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp9
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td41
-rw-r--r--mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp86
-rw-r--r--mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp42
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp45
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp6
-rw-r--r--mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp32
-rw-r--r--mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp99
-rw-r--r--mlir/lib/Pass/IRPrinting.cpp162
-rw-r--r--mlir/lib/Pass/PassManagerOptions.cpp11
-rw-r--r--mlir/lib/Target/LLVM/CMakeLists.txt2
-rw-r--r--mlir/lib/Target/LLVM/NVVM/Target.cpp34
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp2
-rw-r--r--mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp108
-rw-r--r--mlir/lib/Target/LLVMIR/ModuleTranslation.cpp2
-rw-r--r--mlir/lib/Transforms/Mem2Reg.cpp2
-rw-r--r--mlir/lib/Transforms/SROA.cpp1
-rw-r--r--mlir/lib/Transforms/TopologicalSort.cpp2
-rw-r--r--mlir/lib/Transforms/Utils/CMakeLists.txt1
-rw-r--r--mlir/lib/Transforms/Utils/RegionUtils.cpp19
-rw-r--r--mlir/lib/Transforms/ViewOpGraph.cpp2
-rw-r--r--mlir/python/mlir/dialects/linalg/__init__.py5
-rw-r--r--mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py10
-rw-r--r--mlir/test/Analysis/DataFlow/test-next-access.mlir4
-rw-r--r--mlir/test/Analysis/test-liveness.mlir24
-rw-r--r--mlir/test/Analysis/test-topoligical-sort.mlir53
-rw-r--r--mlir/test/Analysis/test-toposort.mlir (renamed from mlir/test/Transforms/test-toposort.mlir)0
-rw-r--r--mlir/test/CMakeLists.txt2
-rw-r--r--mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir7
-rw-r--r--mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir63
-rw-r--r--mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir8
-rw-r--r--mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir4
-rw-r--r--mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir12
-rw-r--r--mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir6
-rw-r--r--mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir2
-rw-r--r--mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir11
-rw-r--r--mlir/test/Dialect/Affine/slicing-utils.mlir160
-rw-r--r--mlir/test/Dialect/Arith/canonicalize.mlir8
-rw-r--r--mlir/test/Dialect/Arith/int-range-interface.mlir135
-rw-r--r--mlir/test/Dialect/Arith/int-range-opts.mlir4
-rw-r--r--mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir4
-rw-r--r--mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir32
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir8
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir8
-rw-r--r--mlir/test/Dialect/GPU/barrier-elimination.mlir2
-rw-r--r--mlir/test/Dialect/GPU/ops.mlir2
-rw-r--r--mlir/test/Dialect/GPU/outlining.mlir2
-rw-r--r--mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir2
-rw-r--r--mlir/test/Dialect/LLVMIR/nvvm.mlir14
-rw-r--r--mlir/test/Dialect/LLVMIR/type-consistency.mlir533
-rw-r--r--mlir/test/Dialect/Linalg/block-pack-matmul.mlir29
-rw-r--r--mlir/test/Dialect/Linalg/data-layout-propagation.mlir2
-rw-r--r--mlir/test/Dialect/Linalg/mesh-sharding-propagation.mlir34
-rw-r--r--mlir/test/Dialect/Linalg/transform-tile-reduction.mlir50
-rw-r--r--mlir/test/Dialect/Math/expand-math.mlir2
-rw-r--r--mlir/test/Dialect/Mesh/sharding-propagation.mlir38
-rw-r--r--mlir/test/Dialect/Mesh/spmdization.mlir15
-rw-r--r--mlir/test/Dialect/OpenMP/invalid.mlir3
-rw-r--r--mlir/test/Dialect/OpenMP/ops.mlir10
-rw-r--r--mlir/test/Dialect/Polynomial/canonicalization.mlir57
-rw-r--r--mlir/test/Dialect/Polynomial/ops.mlir12
-rw-r--r--mlir/test/Dialect/SCF/transform-ops.mlir6
-rw-r--r--mlir/test/Dialect/SPIRV/IR/logical-ops.mlir12
-rw-r--r--mlir/test/Dialect/SPIRV/IR/structure-ops.mlir4
-rw-r--r--mlir/test/Dialect/Tensor/canonicalize.mlir1
-rw-r--r--mlir/test/Dialect/Tensor/fold-empty-op.mlir73
-rw-r--r--mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir2
-rw-r--r--mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir102
-rw-r--r--mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir128
-rw-r--r--mlir/test/Dialect/Vector/invalid.mlir56
-rw-r--r--mlir/test/Dialect/Vector/ops.mlir42
-rw-r--r--mlir/test/IR/parser.mlir2
-rw-r--r--mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir9
-rw-r--r--mlir/test/Pass/ir-printing-file-tree.mlir41
-rw-r--r--mlir/test/Target/LLVMIR/Import/global-variables.ll2
-rw-r--r--mlir/test/Target/LLVMIR/Import/metadata-loop.ll2
-rw-r--r--mlir/test/Target/LLVMIR/llvmir-debug.mlir2
-rw-r--r--mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir2
-rw-r--r--mlir/test/Transforms/test-convert-func-op.mlir12
-rw-r--r--mlir/test/lib/Analysis/CMakeLists.txt1
-rw-r--r--mlir/test/lib/Analysis/TestSlice.cpp33
-rw-r--r--mlir/test/lib/Analysis/TestTopologicalSort.cpp (renamed from mlir/test/lib/Transforms/TestTopologicalSort.cpp)2
-rw-r--r--mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt1
-rw-r--r--mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp93
-rw-r--r--mlir/test/lib/Dialect/Test/TestOpDefs.cpp19
-rw-r--r--mlir/test/lib/Dialect/Test/TestOps.td6
-rw-r--r--mlir/test/lib/Transforms/CMakeLists.txt1
-rw-r--r--mlir/test/lit.cfg.py2
-rw-r--r--mlir/test/lit.site.cfg.py.in2
-rw-r--r--mlir/test/mlir-tblgen/op-decl-and-defs.td21
-rw-r--r--mlir/test/mlir-tblgen/op-operand.td3
-rw-r--r--mlir/test/mlir-tblgen/pattern.mlir8
-rw-r--r--mlir/test/mlir-vulkan-runner/addui_extended.mlir (renamed from mlir/test/mlir-vulkan-runner/iaddcarry_extended.mlir)0
-rw-r--r--mlir/test/python/dialects/transform_structured_ext.py2
-rw-r--r--mlir/test/python/ir/array_attributes.py82
-rw-r--r--mlir/tools/mlir-opt/mlir-opt.cpp2
-rw-r--r--mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp29
-rw-r--r--mlir/tools/mlir-tblgen/RewriterGen.cpp27
-rw-r--r--mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp2
-rw-r--r--offload/CMakeLists.txt22
-rw-r--r--offload/DeviceRTL/include/Utils.h2
-rw-r--r--offload/DeviceRTL/src/Mapping.cpp4
-rw-r--r--offload/DeviceRTL/src/Utils.cpp14
-rw-r--r--offload/cmake/Modules/LibomptargetGetDependencies.cmake8
-rw-r--r--offload/plugins-nextgen/amdgpu/CMakeLists.txt8
-rw-r--r--offload/plugins-nextgen/common/include/JIT.h4
-rw-r--r--offload/plugins-nextgen/common/include/PluginInterface.h12
-rw-r--r--offload/plugins-nextgen/common/src/JIT.cpp16
-rw-r--r--offload/plugins-nextgen/common/src/PluginInterface.cpp34
-rw-r--r--offload/plugins-nextgen/cuda/CMakeLists.txt11
-rw-r--r--offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h9
-rw-r--r--offload/plugins-nextgen/host/CMakeLists.txt4
-rw-r--r--offload/src/PluginManager.cpp34
-rw-r--r--offload/test/offloading/dynamic_module.c2
-rw-r--r--offload/test/offloading/fortran/dump_map_tables.f9038
-rw-r--r--offload/test/offloading/ompx_bare_ballot_sync.c45
-rw-r--r--openmp/cmake/OpenMPTesting.cmake2
-rw-r--r--openmp/docs/SupportAndFAQ.rst9
-rw-r--r--openmp/docs/remarks/OMP121.rst6
-rw-r--r--openmp/docs/remarks/OMP133.rst6
-rw-r--r--openmp/docs/remarks/OptimizationRemarks.rst4
-rw-r--r--openmp/runtime/src/include/ompx.h.var12
-rw-r--r--openmp/runtime/test/lit.cfg4
-rw-r--r--openmp/runtime/test/transform/tile/foreach.cpp228
-rw-r--r--openmp/runtime/test/transform/tile/iterfor.cpp233
-rw-r--r--openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp366
-rw-r--r--openmp/runtime/test/transform/unroll/factor_foreach.cpp162
-rw-r--r--openmp/runtime/test/transform/unroll/factor_intfor.c25
-rw-r--r--openmp/runtime/test/transform/unroll/factor_iterfor.cpp169
-rw-r--r--openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-foreach.cpp199
-rw-r--r--openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-intfor.cpp32
-rw-r--r--openmp/runtime/test/transform/unroll/full_intfor.c25
-rw-r--r--openmp/runtime/test/transform/unroll/heuristic_intfor.c25
-rw-r--r--openmp/runtime/test/transform/unroll/partial_intfor.c25
-rw-r--r--polly/test/CodeGen/20100617.ll2
-rw-r--r--polly/test/CodeGen/20100622.ll4
-rw-r--r--polly/test/CodeGen/20100707.ll2
-rw-r--r--polly/test/CodeGen/20100707_2.ll2
-rw-r--r--polly/test/CodeGen/20100708.ll2
-rw-r--r--polly/test/CodeGen/20100708_2.ll2
-rw-r--r--polly/test/CodeGen/20100713.ll2
-rw-r--r--polly/test/CodeGen/20100713_2.ll2
-rw-r--r--polly/test/CodeGen/20100717.ll2
-rw-r--r--polly/test/CodeGen/20100718-DomInfo-2.ll2
-rw-r--r--polly/test/CodeGen/20100718-DomInfo.ll2
-rw-r--r--polly/test/CodeGen/20100720-MultipleConditions.ll2
-rw-r--r--polly/test/CodeGen/20100809-IndependentBlock.ll2
-rw-r--r--polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll2
-rw-r--r--polly/test/CodeGen/20101030-Overflow.ll2
-rw-r--r--polly/test/CodeGen/20101103-Overflow3.ll2
-rw-r--r--polly/test/CodeGen/20101103-signmissmatch.ll2
-rw-r--r--polly/test/CodeGen/20110226-Ignore-Dead-Code.ll2
-rw-r--r--polly/test/CodeGen/20110226-PHI-Node-removed.ll2
-rw-r--r--polly/test/CodeGen/20120316-InvalidCast.ll2
-rw-r--r--polly/test/CodeGen/20120403-RHS-type-mismatch.ll2
-rw-r--r--polly/test/CodeGen/20130221.ll2
-rw-r--r--polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll2
-rw-r--r--polly/test/CodeGen/Intrinsics/llvm-expect.ll2
-rw-r--r--polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll2
-rw-r--r--polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll2
-rw-r--r--polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/bad_alignment.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_address_space.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_constant_offset.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_simple.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_simple_float.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_simple_md.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/different_types.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/generate-all.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/invariant_base_ptr.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/multiple_types.ll4
-rw-r--r--polly/test/CodeGen/MemAccess/simple.ll2
-rw-r--r--polly/test/CodeGen/MemAccess/update_access_functions.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/alias-metadata.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/inlineasm.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/mapped-phi-access.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/matmul-parallel.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/recomputed-srem.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll12
-rw-r--r--polly/test/CodeGen/OpenMP/reference-other-bb.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/reference_latest.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/scev-rewriting.ll2
-rw-r--r--polly/test/CodeGen/OpenMP/single_loop.ll18
-rw-r--r--polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll4
-rw-r--r--polly/test/CodeGen/OpenMP/single_loop_with_param.ll12
-rw-r--r--polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll4
-rw-r--r--polly/test/CodeGen/PHIInExit.ll2
-rw-r--r--polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll2
-rw-r--r--polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll2
-rw-r--r--polly/test/CodeGen/alias-check-multi-dim.ll2
-rw-r--r--polly/test/CodeGen/alias_metadata_too_many_arrays.ll2
-rw-r--r--polly/test/CodeGen/aliasing_different_base_and_access_type.ll2
-rw-r--r--polly/test/CodeGen/aliasing_different_pointer_types.ll2
-rw-r--r--polly/test/CodeGen/aliasing_multidimensional_access.ll2
-rw-r--r--polly/test/CodeGen/aliasing_parametric_simple_1.ll2
-rw-r--r--polly/test/CodeGen/aliasing_parametric_simple_2.ll2
-rw-r--r--polly/test/CodeGen/aliasing_struct_element.ll2
-rw-r--r--polly/test/CodeGen/alignment.ll2
-rw-r--r--polly/test/CodeGen/annotated_alias_scopes.ll2
-rw-r--r--polly/test/CodeGen/blas_sscal_simplified.ll2
-rw-r--r--polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll2
-rw-r--r--polly/test/CodeGen/constant_condition.ll2
-rw-r--r--polly/test/CodeGen/create-conditional-scop.ll2
-rw-r--r--polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll2
-rw-r--r--polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll2
-rw-r--r--polly/test/CodeGen/debug-intrinsics.ll8
-rw-r--r--polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll2
-rw-r--r--polly/test/CodeGen/empty_domain_in_context.ll2
-rw-r--r--polly/test/CodeGen/entry_with_trivial_phi.ll2
-rw-r--r--polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll2
-rw-r--r--polly/test/CodeGen/error-stmt-in-non-affine-region.ll2
-rw-r--r--polly/test/CodeGen/error_block_contains_invalid_memory_access.ll2
-rw-r--r--polly/test/CodeGen/exprModDiv.ll8
-rw-r--r--polly/test/CodeGen/hoisted_load_escapes_through_phi.ll4
-rw-r--r--polly/test/CodeGen/hoisting_1.ll2
-rw-r--r--polly/test/CodeGen/hoisting_2.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_1.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_2.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_3.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_in_lb.ll4
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll2
-rw-r--r--polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll2
-rw-r--r--polly/test/CodeGen/intrinsics_lifetime.ll2
-rw-r--r--polly/test/CodeGen/intrinsics_misc.ll2
-rw-r--r--polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll2
-rw-r--r--polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll2
-rw-r--r--polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll2
-rw-r--r--polly/test/CodeGen/invariant-load-dimension.ll4
-rw-r--r--polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll2
-rw-r--r--polly/test/CodeGen/invariant_cannot_handle_void.ll4
-rw-r--r--polly/test/CodeGen/invariant_load.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_address_space.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_alias_metadata.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_base_pointer.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_base_pointer_conditional.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll6
-rw-r--r--polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_condition.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_different_sized_types.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_escaping.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_escaping_second_scop.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_loop_ub.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_outermost.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll4
-rw-r--r--polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_scalar_dep.ll2
-rw-r--r--polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll2
-rw-r--r--polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll2
-rw-r--r--polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll2
-rw-r--r--polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll2
-rw-r--r--polly/test/CodeGen/invariant_verify_function_failed.ll2
-rw-r--r--polly/test/CodeGen/invariant_verify_function_failed_2.ll4
-rw-r--r--polly/test/CodeGen/issue56692.ll2
-rw-r--r--polly/test/CodeGen/large-numbers-in-boundary-context.ll2
-rw-r--r--polly/test/CodeGen/load_subset_with_context.ll2
-rw-r--r--polly/test/CodeGen/loop-invariant-load-type-mismatch.ll2
-rw-r--r--polly/test/CodeGen/loop_with_condition.ll2
-rw-r--r--polly/test/CodeGen/loop_with_condition_2.ll2
-rw-r--r--polly/test/CodeGen/loop_with_condition_ineq.ll2
-rw-r--r--polly/test/CodeGen/loop_with_condition_nested.ll4
-rw-r--r--polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll2
-rw-r--r--polly/test/CodeGen/memcpy_annotations.ll2
-rw-r--r--polly/test/CodeGen/multidim-non-matching-typesize-2.ll2
-rw-r--r--polly/test/CodeGen/multidim-non-matching-typesize.ll2
-rw-r--r--polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll2
-rw-r--r--polly/test/CodeGen/multidim_alias_check.ll2
-rw-r--r--polly/test/CodeGen/multiple-codegens.ll1
-rw-r--r--polly/test/CodeGen/multiple-scops-in-a-row.ll2
-rw-r--r--polly/test/CodeGen/multiple-types-invariant-load-2.ll2
-rw-r--r--polly/test/CodeGen/multiple-types-invariant-load.ll2
-rw-r--r--polly/test/CodeGen/multiple_sai_fro_same_base_address.ll4
-rw-r--r--polly/test/CodeGen/no-overflow-tracking.ll4
-rw-r--r--polly/test/CodeGen/no_guard_bb.ll2
-rw-r--r--polly/test/CodeGen/non-affine-dominance-generated-entering.ll2
-rw-r--r--polly/test/CodeGen/non-affine-exit-node-dominance.ll2
-rw-r--r--polly/test/CodeGen/non-affine-phi-node-expansion-2.ll2
-rw-r--r--polly/test/CodeGen/non-affine-phi-node-expansion-3.ll2
-rw-r--r--polly/test/CodeGen/non-affine-phi-node-expansion-4.ll2
-rw-r--r--polly/test/CodeGen/non-affine-phi-node-expansion.ll2
-rw-r--r--polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll2
-rw-r--r--polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll2
-rw-r--r--polly/test/CodeGen/non-affine-region-implicit-store.ll2
-rw-r--r--polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll2
-rw-r--r--polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll2
-rw-r--r--polly/test/CodeGen/non-affine-switch.ll4
-rw-r--r--polly/test/CodeGen/non-affine-synthesized-in-branch.ll2
-rw-r--r--polly/test/CodeGen/non-affine-update.ll4
-rw-r--r--polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll2
-rw-r--r--polly/test/CodeGen/non_affine_float_compare.ll2
-rw-r--r--polly/test/CodeGen/only_non_affine_error_region.ll2
-rw-r--r--polly/test/CodeGen/openmp_limit_threads.ll12
-rw-r--r--polly/test/CodeGen/out-of-scop-phi-node-use.ll2
-rw-r--r--polly/test/CodeGen/param_div_div_div_2.ll4
-rw-r--r--polly/test/CodeGen/partial_write_array.ll2
-rw-r--r--polly/test/CodeGen/partial_write_emptyset.ll2
-rw-r--r--polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll2
-rw-r--r--polly/test/CodeGen/partial_write_impossible_restriction.ll2
-rw-r--r--polly/test/CodeGen/partial_write_in_region.ll4
-rw-r--r--polly/test/CodeGen/partial_write_in_region_with_loop.ll4
-rw-r--r--polly/test/CodeGen/partial_write_mapped_scalar.ll2
-rw-r--r--polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll2
-rw-r--r--polly/test/CodeGen/perf_monitoring.ll2
-rw-r--r--polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll2
-rw-r--r--polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll2
-rw-r--r--polly/test/CodeGen/phi-defined-before-scop.ll2
-rw-r--r--polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll2
-rw-r--r--polly/test/CodeGen/phi_condition_modeling_1.ll2
-rw-r--r--polly/test/CodeGen/phi_condition_modeling_2.ll2
-rw-r--r--polly/test/CodeGen/phi_conditional_simple_1.ll4
-rw-r--r--polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll2
-rw-r--r--polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll2
-rw-r--r--polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll2
-rw-r--r--polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll2
-rw-r--r--polly/test/CodeGen/phi_loop_carried_float.ll2
-rw-r--r--polly/test/CodeGen/phi_loop_carried_float_escape.ll8
-rw-r--r--polly/test/CodeGen/phi_scalar_simple_1.ll2
-rw-r--r--polly/test/CodeGen/phi_scalar_simple_2.ll2
-rw-r--r--polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll2
-rw-r--r--polly/test/CodeGen/phi_with_one_exit_edge.ll2
-rw-r--r--polly/test/CodeGen/pointer-type-expressions-2.ll4
-rw-r--r--polly/test/CodeGen/pointer-type-expressions.ll4
-rw-r--r--polly/test/CodeGen/pointer-type-pointer-type-comparison.ll4
-rw-r--r--polly/test/CodeGen/pointer_rem.ll4
-rw-r--r--polly/test/CodeGen/pr25241.ll2
-rw-r--r--polly/test/CodeGen/ptrtoint_as_parameter.ll2
-rw-r--r--polly/test/CodeGen/read-only-scalars.ll4
-rw-r--r--polly/test/CodeGen/reduction.ll2
-rw-r--r--polly/test/CodeGen/reduction_2.ll2
-rw-r--r--polly/test/CodeGen/reduction_simple_binary.ll2
-rw-r--r--polly/test/CodeGen/region-with-instructions.ll2
-rw-r--r--polly/test/CodeGen/region_exiting-domtree.ll2
-rw-r--r--polly/test/CodeGen/region_multiexit_partialwrite.ll2
-rw-r--r--polly/test/CodeGen/run-time-condition-with-scev-parameters.ll4
-rw-r--r--polly/test/CodeGen/run-time-condition.ll2
-rw-r--r--polly/test/CodeGen/scalar-references-used-in-scop-compute.ll2
-rw-r--r--polly/test/CodeGen/scalar-store-from-same-bb.ll4
-rw-r--r--polly/test/CodeGen/scalar_codegen_crash.ll4
-rw-r--r--polly/test/CodeGen/scev-backedgetaken.ll2
-rw-r--r--polly/test/CodeGen/scev-division-invariant-load.ll2
-rw-r--r--polly/test/CodeGen/scev.ll2
-rw-r--r--polly/test/CodeGen/scev_expansion_in_nonaffine.ll2
-rw-r--r--polly/test/CodeGen/scev_looking_through_bitcasts.ll2
-rw-r--r--polly/test/CodeGen/scop_expander_insert_point.ll2
-rw-r--r--polly/test/CodeGen/scop_expander_segfault.ll2
-rw-r--r--polly/test/CodeGen/scop_never_executed_runtime_check_location.ll2
-rw-r--r--polly/test/CodeGen/select-base-pointer.ll2
-rw-r--r--polly/test/CodeGen/sequential_loops.ll2
-rw-r--r--polly/test/CodeGen/simple_loop_non_single_exit.ll2
-rw-r--r--polly/test/CodeGen/simple_loop_non_single_exit_2.ll2
-rw-r--r--polly/test/CodeGen/simple_non_single_entry.ll2
-rw-r--r--polly/test/CodeGen/simple_nonaffine_loop.ll2
-rw-r--r--polly/test/CodeGen/single_do_loop_int_max_iterations.ll2
-rw-r--r--polly/test/CodeGen/single_do_loop_int_param_iterations.ll2
-rw-r--r--polly/test/CodeGen/single_do_loop_ll_max_iterations.ll4
-rw-r--r--polly/test/CodeGen/single_do_loop_one_iteration.ll2
-rw-r--r--polly/test/CodeGen/single_do_loop_scev_replace.ll2
-rw-r--r--polly/test/CodeGen/single_loop.ll2
-rw-r--r--polly/test/CodeGen/single_loop_int_max_iterations.ll2
-rw-r--r--polly/test/CodeGen/single_loop_ll_max_iterations.ll2
-rw-r--r--polly/test/CodeGen/single_loop_one_iteration.ll2
-rw-r--r--polly/test/CodeGen/single_loop_param.ll2
-rw-r--r--polly/test/CodeGen/single_loop_param_less_equal.ll6
-rw-r--r--polly/test/CodeGen/single_loop_param_less_than.ll4
-rw-r--r--polly/test/CodeGen/single_loop_zero_iterations.ll2
-rw-r--r--polly/test/CodeGen/split_edge_of_exit.ll4
-rw-r--r--polly/test/CodeGen/split_edges.ll2
-rw-r--r--polly/test/CodeGen/split_edges_2.ll2
-rw-r--r--polly/test/CodeGen/srem-in-other-bb.ll2
-rw-r--r--polly/test/CodeGen/stack-overflow-in-load-hoisting.ll2
-rw-r--r--polly/test/CodeGen/stmt_split_no_dependence.ll2
-rw-r--r--polly/test/CodeGen/switch-in-non-affine-region.ll4
-rw-r--r--polly/test/CodeGen/synthesizable_phi_write_after_loop.ll2
-rw-r--r--polly/test/CodeGen/test-invalid-operands-for-select-2.ll2
-rw-r--r--polly/test/CodeGen/test-invalid-operands-for-select.ll2
-rw-r--r--polly/test/CodeGen/test.ll2
-rw-r--r--polly/test/CodeGen/two-loops-right-after-each-other-2.ll2
-rw-r--r--polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll2
-rw-r--r--polly/test/CodeGen/two-scops-in-row.ll4
-rw-r--r--polly/test/CodeGen/udiv_expansion_position.ll2
-rw-r--r--polly/test/CodeGen/uninitialized_scalar_memory.ll2
-rw-r--r--polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll6
-rw-r--r--polly/test/CodeGen/variant_load_empty_domain.ll2
-rw-r--r--polly/test/CodeGen/whole-scop-non-affine-subregion.ll4
-rw-r--r--polly/test/DeLICM/confused_order.ll4
-rw-r--r--polly/test/DeLICM/contradicting_assumed_context_and_domain.ll2
-rw-r--r--polly/test/DeLICM/load-in-cond-inf-loop.ll2
-rw-r--r--polly/test/DeLICM/map_memset_zero.ll4
-rw-r--r--polly/test/DeLICM/nomap_alreadymapped.ll2
-rw-r--r--polly/test/DeLICM/nomap_escaping.ll2
-rw-r--r--polly/test/DeLICM/nomap_occupied.ll2
-rw-r--r--polly/test/DeLICM/nomap_readonly.ll2
-rw-r--r--polly/test/DeLICM/nomap_spuriouswrite.ll2
-rw-r--r--polly/test/DeLICM/nomap_storagesize.ll2
-rw-r--r--polly/test/DeLICM/nomap_writewrite.ll2
-rw-r--r--polly/test/DeLICM/outofquota-reverseDomain.ll2
-rw-r--r--polly/test/DeLICM/pass_existence.ll6
-rw-r--r--polly/test/DeLICM/pr41656.ll2
-rw-r--r--polly/test/DeLICM/pr48783.ll2
-rw-r--r--polly/test/DeLICM/reduction.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll2
-rw-r--r--polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll2
-rw-r--r--polly/test/DeLICM/reduction_unrelatedunusual.ll2
-rw-r--r--polly/test/DeLICM/reject_loadafterstore.ll2
-rw-r--r--polly/test/DeLICM/reject_outofquota.ll4
-rw-r--r--polly/test/DeLICM/reject_storeafterstore.ll2
-rw-r--r--polly/test/DeLICM/reject_storeinsubregion.ll2
-rw-r--r--polly/test/DeLICM/reject_unusualstore.ll4
-rw-r--r--polly/test/DeLICM/skip_maywrite.ll2
-rw-r--r--polly/test/DeLICM/skip_multiaccess.ll2
-rw-r--r--polly/test/DeLICM/skip_notinloop.ll2
-rw-r--r--polly/test/DeLICM/skip_scalaraccess.ll2
-rw-r--r--polly/test/DeadCodeElimination/chained_iterations.ll4
-rw-r--r--polly/test/DeadCodeElimination/chained_iterations_2.ll4
-rw-r--r--polly/test/DeadCodeElimination/computeout.ll3
-rw-r--r--polly/test/DeadCodeElimination/dead_iteration_elimination.ll1
-rw-r--r--polly/test/DeadCodeElimination/non-affine-affine-mix.ll2
-rw-r--r--polly/test/DeadCodeElimination/non-affine.ll2
-rw-r--r--polly/test/DeadCodeElimination/null_schedule.ll2
-rw-r--r--polly/test/DependenceInfo/computeout.ll6
-rw-r--r--polly/test/DependenceInfo/different_schedule_dimensions.ll4
-rw-r--r--polly/test/DependenceInfo/do_pluto_matmult.ll6
-rw-r--r--polly/test/DependenceInfo/fine_grain_dep_0.ll7
-rw-r--r--polly/test/DependenceInfo/generate_may_write_dependence_info.ll2
-rw-r--r--polly/test/DependenceInfo/infeasible_context.ll5
-rw-r--r--polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll2
-rw-r--r--polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_complex_location.ll6
-rw-r--r--polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_dependences_not_null.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll6
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_reductions.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_multiple_reductions_2.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_only_reduction_like_access.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps_2.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps_3.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps_4.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_privatization_deps_5.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_sequence.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_simple_iv.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll2
-rw-r--r--polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll2
-rw-r--r--polly/test/DependenceInfo/sequential_loops.ll79
-rw-r--r--polly/test/ForwardOpTree/atax.ll2
-rw-r--r--polly/test/ForwardOpTree/changed-kind.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_from_region.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_hoisted.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_instruction.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_into_region.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_into_region_redundant_use.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load.ll1
-rw-r--r--polly/test/ForwardOpTree/forward_load_differentarray.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_double_write.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_fromloop.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_indirect.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_memset_after.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_memset_before.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_tripleuse.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_phi_load.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_readonly.ll4
-rw-r--r--polly/test/ForwardOpTree/forward_reusue.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_store.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_synthesizable_definloop.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_synthesizable_indvar.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll2
-rw-r--r--polly/test/ForwardOpTree/forward_transitive.ll2
-rw-r--r--polly/test/ForwardOpTree/jacobi-1d.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_from_region.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_load_conditional.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_load_writebetween.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_outofquota.ll4
-rw-r--r--polly/test/ForwardOpTree/noforward_partial.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_phi.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_selfrefphi.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_sideffects.ll2
-rw-r--r--polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll2
-rw-r--r--polly/test/ForwardOpTree/out-of-quota1.ll2
-rw-r--r--polly/test/IstAstInfo/alias_checks_with_empty_context.ll2
-rw-r--r--polly/test/IstAstInfo/alias_simple_1.ll10
-rw-r--r--polly/test/IstAstInfo/alias_simple_2.ll12
-rw-r--r--polly/test/IstAstInfo/alias_simple_3.ll10
-rw-r--r--polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll2
-rw-r--r--polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll4
-rw-r--r--polly/test/IstAstInfo/aliasing_parametric_simple_1.ll2
-rw-r--r--polly/test/IstAstInfo/aliasing_parametric_simple_2.ll2
-rw-r--r--polly/test/IstAstInfo/dependence_distance_minimal.ll2
-rw-r--r--polly/test/IstAstInfo/domain_bounded_only_with_context.ll2
-rw-r--r--polly/test/IstAstInfo/non_affine_access.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_different_reduction_clauses.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_multiple_dimensions.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll2
-rw-r--r--polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll2
-rw-r--r--polly/test/IstAstInfo/run-time-condition.ll2
-rw-r--r--polly/test/IstAstInfo/runtime_context_with_error_blocks.ll2
-rw-r--r--polly/test/IstAstInfo/simple-run-time-condition.ll2
-rw-r--r--polly/test/IstAstInfo/single_loop_strip_mine.ll4
-rw-r--r--polly/test/IstAstInfo/single_loop_uint_max_iterations.ll2
-rw-r--r--polly/test/IstAstInfo/single_loop_ull_max_iterations.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll2
-rw-r--r--polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll2
-rw-r--r--polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll2
-rw-r--r--polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll2
-rw-r--r--polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll2
-rw-r--r--polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll2
-rw-r--r--polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll2
-rw-r--r--polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll2
-rw-r--r--polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll2
-rw-r--r--polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll2
-rw-r--r--polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/read_from_original.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/too_many_writes.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/working_deps_between_inners.ll1
-rw-r--r--polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/working_expansion.ll1
-rw-r--r--polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll1
-rw-r--r--polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll1
-rw-r--r--polly/test/MaximalStaticExpansion/working_phi_expansion.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll2
-rw-r--r--polly/test/MaximalStaticExpansion/working_value_expansion.ll1
-rw-r--r--polly/test/PruneUnprofitable/prune_only_scalardeps.ll1
-rw-r--r--polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll2
-rw-r--r--polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll2
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll4
-rw-r--r--polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll4
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll4
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll4
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll4
-rw-r--r--polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll8
-rw-r--r--polly/test/ScheduleOptimizer/SIMDInParallelFor.ll2
-rw-r--r--polly/test/ScheduleOptimizer/computeout.ll2
-rw-r--r--polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll6
-rw-r--r--polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll1
-rw-r--r--polly/test/ScheduleOptimizer/full_partial_tile_separation.ll2
-rw-r--r--polly/test/ScheduleOptimizer/line-tiling-2.ll2
-rw-r--r--polly/test/ScheduleOptimizer/line-tiling.ll2
-rw-r--r--polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll2
-rw-r--r--polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll6
-rw-r--r--polly/test/ScheduleOptimizer/one-dimensional-band.ll2
-rw-r--r--polly/test/ScheduleOptimizer/outer_coincidence.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll6
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll8
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll2
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll12
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll8
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll8
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll8
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll4
-rw-r--r--polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll6
-rw-r--r--polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll2
-rw-r--r--polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll2
-rw-r--r--polly/test/ScheduleOptimizer/prevectorization.ll4
-rw-r--r--polly/test/ScheduleOptimizer/rectangular-tiling.ll8
-rw-r--r--polly/test/ScheduleOptimizer/schedule_computeout.ll2
-rw-r--r--polly/test/ScheduleOptimizer/statistics.ll2
-rw-r--r--polly/test/ScheduleOptimizer/tile_after_fusion.ll4
-rw-r--r--polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll2
-rw-r--r--polly/test/ScopDetect/aliasing_parametric_simple_1.ll2
-rw-r--r--polly/test/ScopDetect/aliasing_parametric_simple_2.ll2
-rw-r--r--polly/test/ScopDetect/aliasing_simple_1.ll2
-rw-r--r--polly/test/ScopDetect/aliasing_simple_2.ll2
-rw-r--r--polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll2
-rw-r--r--polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll2
-rw-r--r--polly/test/ScopDetect/callbr.ll4
-rw-r--r--polly/test/ScopDetect/collective_invariant_loads.ll2
-rw-r--r--polly/test/ScopDetect/cross_loop_non_single_exit.ll2
-rw-r--r--polly/test/ScopDetect/cross_loop_non_single_exit_2.ll2
-rw-r--r--polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll2
-rw-r--r--polly/test/ScopDetect/dot-scops-npm.ll2
-rw-r--r--polly/test/ScopDetect/dot-scops.ll2
-rw-r--r--polly/test/ScopDetect/error-block-always-executed.ll2
-rw-r--r--polly/test/ScopDetect/error-block-referenced-from-scop.ll2
-rw-r--r--polly/test/ScopDetect/error-block-unreachable.ll2
-rw-r--r--polly/test/ScopDetect/expand-region-correctly-2.ll2
-rw-r--r--polly/test/ScopDetect/expand-region-correctly.ll2
-rw-r--r--polly/test/ScopDetect/ignore_func_flag_regex.ll2
-rw-r--r--polly/test/ScopDetect/index_from_unpredictable_loop.ll4
-rw-r--r--polly/test/ScopDetect/index_from_unpredictable_loop2.ll4
-rw-r--r--polly/test/ScopDetect/indvars.ll2
-rw-r--r--polly/test/ScopDetect/intrinsics_1.ll2
-rw-r--r--polly/test/ScopDetect/intrinsics_2.ll2
-rw-r--r--polly/test/ScopDetect/intrinsics_3.ll2
-rw-r--r--polly/test/ScopDetect/invalid-latch-conditions.ll6
-rw-r--r--polly/test/ScopDetect/invalidate_scalar_evolution.ll2
-rw-r--r--polly/test/ScopDetect/invariant-load-before-scop.ll2
-rw-r--r--polly/test/ScopDetect/keep_going_expansion.ll2
-rw-r--r--polly/test/ScopDetect/mod_ref_read_pointer.ll4
-rw-r--r--polly/test/ScopDetect/more-than-one-loop.ll4
-rw-r--r--polly/test/ScopDetect/multidim-with-undef-size.ll2
-rw-r--r--polly/test/ScopDetect/multidim.ll2
-rw-r--r--polly/test/ScopDetect/multidim_indirect_access.ll2
-rw-r--r--polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll2
-rw-r--r--polly/test/ScopDetect/nested_loop_single_exit.ll4
-rw-r--r--polly/test/ScopDetect/non-affine-conditional.ll2
-rw-r--r--polly/test/ScopDetect/non-affine-float-compare.ll2
-rw-r--r--polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll8
-rw-r--r--polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll6
-rw-r--r--polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll6
-rw-r--r--polly/test/ScopDetect/non-affine-loop.ll10
-rw-r--r--polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll2
-rw-r--r--polly/test/ScopDetect/non-constant-add-rec-start-expr.ll2
-rw-r--r--polly/test/ScopDetect/non-simple-memory-accesses.ll2
-rw-r--r--polly/test/ScopDetect/non_affine_loop_condition.ll4
-rw-r--r--polly/test/ScopDetect/only-one-affine-loop.ll2
-rw-r--r--polly/test/ScopDetect/only_func_flag.ll2
-rw-r--r--polly/test/ScopDetect/only_func_flag_regex.ll2
-rw-r--r--polly/test/ScopDetect/parametric-multiply-in-scev-2.ll2
-rw-r--r--polly/test/ScopDetect/parametric-multiply-in-scev.ll2
-rw-r--r--polly/test/ScopDetect/phi_with_multi_exiting_edges.ll2
-rw-r--r--polly/test/ScopDetect/profitability-large-basic-blocks.ll12
-rw-r--r--polly/test/ScopDetect/profitability-two-nested-loops.ll2
-rw-r--r--polly/test/ScopDetect/remove_all_children.ll2
-rw-r--r--polly/test/ScopDetect/report-scop-location.ll2
-rw-r--r--polly/test/ScopDetect/restrict-undef-size-scopdetect.ll2
-rw-r--r--polly/test/ScopDetect/run_time_alias_check.ll2
-rw-r--r--polly/test/ScopDetect/scev_remove_max.ll2
-rw-r--r--polly/test/ScopDetect/sequential_loops.ll6
-rw-r--r--polly/test/ScopDetect/simple_loop.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_non_single_entry.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_non_single_exit.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_non_single_exit_2.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_two_phi_nodes.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_with_param.ll2
-rw-r--r--polly/test/ScopDetect/simple_loop_with_param_2.ll2
-rw-r--r--polly/test/ScopDetect/simple_non_single_entry.ll2
-rw-r--r--polly/test/ScopDetect/skip_function_attribute.ll2
-rw-r--r--polly/test/ScopDetect/srem_with_parametric_divisor.ll2
-rw-r--r--polly/test/ScopDetect/statistics.ll2
-rw-r--r--polly/test/ScopDetect/switch-in-loop-patch.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportEntry.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll12
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll4
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll12
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll8
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll2
-rw-r--r--polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll2
-rw-r--r--polly/test/ScopInfo/20110312-Fail-without-basicaa.ll2
-rw-r--r--polly/test/ScopInfo/20111108-Parameter-not-detected.ll2
-rw-r--r--polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll2
-rw-r--r--polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll2
-rw-r--r--polly/test/ScopInfo/Alias-0.ll4
-rw-r--r--polly/test/ScopInfo/Alias-1.ll4
-rw-r--r--polly/test/ScopInfo/Alias-2.ll4
-rw-r--r--polly/test/ScopInfo/Alias-3.ll4
-rw-r--r--polly/test/ScopInfo/Alias-4.ll4
-rw-r--r--polly/test/ScopInfo/BoundChecks/single-loop.ll4
-rw-r--r--polly/test/ScopInfo/BoundChecks/two-loops.ll4
-rw-r--r--polly/test/ScopInfo/NonAffine/div_backedge.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/div_domain.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/modulo_backedge.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/modulo_domain.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll4
-rw-r--r--polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll6
-rw-r--r--polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll6
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll8
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll12
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll6
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll4
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll2
-rw-r--r--polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_dead_access.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll8
-rw-r--r--polly/test/ScopInfo/aliasing_many_read_only_acesses.ll2
-rw-r--r--polly/test/ScopInfo/aliasing_multiple_alias_groups.ll4
-rw-r--r--polly/test/ScopInfo/aliasing_with_non_affine_access.ll2
-rw-r--r--polly/test/ScopInfo/allow-all-parameters-dereferencable.ll12
-rw-r--r--polly/test/ScopInfo/assume_gep_bounds.ll4
-rw-r--r--polly/test/ScopInfo/assume_gep_bounds_2.ll2
-rw-r--r--polly/test/ScopInfo/assume_gep_bounds_many.ll4
-rw-r--r--polly/test/ScopInfo/avoid_new_parameters_from_geps.ll2
-rw-r--r--polly/test/ScopInfo/bool-addrec.ll2
-rw-r--r--polly/test/ScopInfo/bounded_loop_assumptions.ll2
-rw-r--r--polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll4
-rw-r--r--polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll6
-rw-r--r--polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll6
-rw-r--r--polly/test/ScopInfo/bug_2010_10_22.ll2
-rw-r--r--polly/test/ScopInfo/bug_2011_1_5.ll2
-rw-r--r--polly/test/ScopInfo/bug_scev_not_fully_eval.ll2
-rw-r--r--polly/test/ScopInfo/cfg_consequences.ll2
-rw-r--r--polly/test/ScopInfo/complex-branch-structure.ll2
-rw-r--r--polly/test/ScopInfo/complex-condition.ll2
-rw-r--r--polly/test/ScopInfo/complex-expression.ll2
-rw-r--r--polly/test/ScopInfo/complex-loop-nesting.ll2
-rw-r--r--polly/test/ScopInfo/complex-successor-structure-2.ll2
-rw-r--r--polly/test/ScopInfo/complex-successor-structure-3.ll4
-rw-r--r--polly/test/ScopInfo/complex-successor-structure.ll2
-rw-r--r--polly/test/ScopInfo/complex_domain_binary_condition.ll2
-rw-r--r--polly/test/ScopInfo/complex_execution_context.ll2
-rw-r--r--polly/test/ScopInfo/cond_constant_in_loop.ll2
-rw-r--r--polly/test/ScopInfo/cond_in_loop.ll2
-rw-r--r--polly/test/ScopInfo/condition-after-error-block-2.ll2
-rw-r--r--polly/test/ScopInfo/condition-after-error-block-before-scop.ll2
-rw-r--r--polly/test/ScopInfo/condtion-after-error-block.ll2
-rw-r--r--polly/test/ScopInfo/const_srem_sdiv.ll4
-rw-r--r--polly/test/ScopInfo/constant-non-integer-branch-condition.ll2
-rw-r--r--polly/test/ScopInfo/constant_factor_in_parameter.ll4
-rw-r--r--polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll2
-rw-r--r--polly/test/ScopInfo/constant_start_integer.ll2
-rw-r--r--polly/test/ScopInfo/debug_call.ll2
-rw-r--r--polly/test/ScopInfo/delinearize-together-all-data-refs.ll2
-rw-r--r--polly/test/ScopInfo/div_by_zero.ll2
-rw-r--r--polly/test/ScopInfo/do-not-model-error-block-accesses.ll2
-rw-r--r--polly/test/ScopInfo/eager-binary-and-or-conditions.ll4
-rw-r--r--polly/test/ScopInfo/early_exit_for_complex_domains.ll2
-rw-r--r--polly/test/ScopInfo/error-blocks-1.ll2
-rw-r--r--polly/test/ScopInfo/error-blocks-2.ll4
-rw-r--r--polly/test/ScopInfo/escaping_empty_scop.ll2
-rw-r--r--polly/test/ScopInfo/exit-phi-1.ll4
-rw-r--r--polly/test/ScopInfo/exit-phi-2.ll2
-rw-r--r--polly/test/ScopInfo/exit_phi_accesses-2.ll2
-rw-r--r--polly/test/ScopInfo/exit_phi_accesses.ll2
-rw-r--r--polly/test/ScopInfo/expensive-boundary-context.ll4
-rw-r--r--polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll4
-rw-r--r--polly/test/ScopInfo/full-function.ll4
-rw-r--r--polly/test/ScopInfo/granularity_same_name.ll8
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll2
-rw-r--r--polly/test/ScopInfo/granularity_scalar-indep_ordered.ll2
-rw-r--r--polly/test/ScopInfo/i1_params.ll2
-rw-r--r--polly/test/ScopInfo/infeasible-rtc.ll4
-rw-r--r--polly/test/ScopInfo/infeasible_invalid_context.ll4
-rw-r--r--polly/test/ScopInfo/int2ptr_ptr2int.ll4
-rw-r--r--polly/test/ScopInfo/int2ptr_ptr2int_2.ll8
-rw-r--r--polly/test/ScopInfo/integers.ll2
-rw-r--r--polly/test/ScopInfo/inter-error-bb-dependence.ll2
-rw-r--r--polly/test/ScopInfo/inter_bb_scalar_dep.ll4
-rw-r--r--polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll4
-rw-r--r--polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll4
-rw-r--r--polly/test/ScopInfo/intra_bb_scalar_dep.ll4
-rw-r--r--polly/test/ScopInfo/intrinsics.ll2
-rw-r--r--polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll2
-rw-r--r--polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll2
-rw-r--r--polly/test/ScopInfo/invariant-load-instlist.ll2
-rw-r--r--polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_addrec_sum.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_base_pointer.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_branch_condition.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_complex_condition.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_condition.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_dereferenceable.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_in_non_affine.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_loop_ub.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_scalar_dep.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_stmt_domain.ll2
-rw-r--r--polly/test/ScopInfo/invariant_load_zext_parameter-2.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_zext_parameter.ll4
-rw-r--r--polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll4
-rw-r--r--polly/test/ScopInfo/invariant_loads_complicated_dependences.ll2
-rw-r--r--polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll2
-rw-r--r--polly/test/ScopInfo/invariant_loop_bounds.ll2
-rw-r--r--polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll2
-rw-r--r--polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll2
-rw-r--r--polly/test/ScopInfo/isl_aff_out_of_bounds.ll2
-rw-r--r--polly/test/ScopInfo/isl_trip_count_01.ll2
-rw-r--r--polly/test/ScopInfo/isl_trip_count_02.ll2
-rw-r--r--polly/test/ScopInfo/isl_trip_count_03.ll2
-rw-r--r--polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll2
-rw-r--r--polly/test/ScopInfo/licm_reduction_nested.ll4
-rw-r--r--polly/test/ScopInfo/long-compile-time-alias-analysis.ll2
-rw-r--r--polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll2
-rw-r--r--polly/test/ScopInfo/long-sequence-of-error-blocks.ll4
-rw-r--r--polly/test/ScopInfo/loop-multiexit-succ-cond.ll4
-rw-r--r--polly/test/ScopInfo/loop_affine_bound_0.ll4
-rw-r--r--polly/test/ScopInfo/loop_affine_bound_1.ll4
-rw-r--r--polly/test/ScopInfo/loop_affine_bound_2.ll4
-rw-r--r--polly/test/ScopInfo/loop_carry.ll2
-rw-r--r--polly/test/ScopInfo/many-scalar-dependences.ll2
-rw-r--r--polly/test/ScopInfo/max-loop-depth.ll2
-rw-r--r--polly/test/ScopInfo/memcpy-raw-source.ll2
-rw-r--r--polly/test/ScopInfo/memcpy.ll4
-rw-r--r--polly/test/ScopInfo/memmove.ll4
-rw-r--r--polly/test/ScopInfo/memset.ll4
-rw-r--r--polly/test/ScopInfo/memset_null.ll4
-rw-r--r--polly/test/ScopInfo/mismatching-array-dimensions.ll2
-rw-r--r--polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll6
-rw-r--r--polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll6
-rw-r--r--polly/test/ScopInfo/mod_ref_read_pointer.ll4
-rw-r--r--polly/test/ScopInfo/mod_ref_read_pointers.ll6
-rw-r--r--polly/test/ScopInfo/modulo_zext_1.ll2
-rw-r--r--polly/test/ScopInfo/modulo_zext_2.ll2
-rw-r--r--polly/test/ScopInfo/modulo_zext_3.ll2
-rw-r--r--polly/test/ScopInfo/multi-scop.ll2
-rw-r--r--polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll4
-rw-r--r--polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll2
-rw-r--r--polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll2
-rw-r--r--polly/test/ScopInfo/multidim_2d_with_modref_call.ll8
-rw-r--r--polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll8
-rw-r--r--polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fold_constant_dim.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll2
-rw-r--r--polly/test/ScopInfo/multidim_fortran_2d.ll4
-rw-r--r--polly/test/ScopInfo/multidim_fortran_2d_params.ll4
-rw-r--r--polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll8
-rw-r--r--polly/test/ScopInfo/multidim_fortran_srem.ll2
-rw-r--r--polly/test/ScopInfo/multidim_gep_pointercast.ll2
-rw-r--r--polly/test/ScopInfo/multidim_gep_pointercast2.ll2
-rw-r--r--polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll2
-rw-r--r--polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll2
-rw-r--r--polly/test/ScopInfo/multidim_many_references.ll4
-rw-r--r--polly/test/ScopInfo/multidim_nested_start_integer.ll4
-rw-r--r--polly/test/ScopInfo/multidim_nested_start_share_parameter.ll2
-rw-r--r--polly/test/ScopInfo/multidim_only_ivs_2d.ll2
-rw-r--r--polly/test/ScopInfo/multidim_only_ivs_3d.ll2
-rw-r--r--polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll2
-rw-r--r--polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll2
-rw-r--r--polly/test/ScopInfo/multidim_param_in_subscript-2.ll2
-rw-r--r--polly/test/ScopInfo/multidim_param_in_subscript.ll2
-rw-r--r--polly/test/ScopInfo/multidim_parameter_addrec_product.ll2
-rw-r--r--polly/test/ScopInfo/multidim_single_and_multidim_array.ll16
-rw-r--r--polly/test/ScopInfo/multidim_srem.ll2
-rw-r--r--polly/test/ScopInfo/multidim_with_bitcast.ll2
-rw-r--r--polly/test/ScopInfo/multiple-binary-or-conditions.ll4
-rw-r--r--polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types-non-affine-2.ll4
-rw-r--r--polly/test/ScopInfo/multiple-types-non-affine.ll4
-rw-r--r--polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types-non-power-of-two.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types-two-dimensional-2.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types-two-dimensional.ll2
-rw-r--r--polly/test/ScopInfo/multiple-types.ll4
-rw-r--r--polly/test/ScopInfo/multiple_exiting_blocks.ll2
-rw-r--r--polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll2
-rw-r--r--polly/test/ScopInfo/multiple_latch_blocks.ll2
-rw-r--r--polly/test/ScopInfo/nested-loops.ll2
-rw-r--r--polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll2
-rw-r--r--polly/test/ScopInfo/non-affine-region-phi.ll4
-rw-r--r--polly/test/ScopInfo/non-affine-region-with-loop-2.ll2
-rw-r--r--polly/test/ScopInfo/non-affine-region-with-loop.ll4
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-1.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-2.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-3.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-4.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-5.ll2
-rw-r--r--polly/test/ScopInfo/non-precise-inv-load-6.ll2
-rw-r--r--polly/test/ScopInfo/non-pure-function-call.ll2
-rw-r--r--polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll2
-rw-r--r--polly/test/ScopInfo/non-pure-function-calls.ll2
-rw-r--r--polly/test/ScopInfo/non_affine_access.ll4
-rw-r--r--polly/test/ScopInfo/non_affine_region_1.ll2
-rw-r--r--polly/test/ScopInfo/non_affine_region_2.ll2
-rw-r--r--polly/test/ScopInfo/non_affine_region_3.ll4
-rw-r--r--polly/test/ScopInfo/non_affine_region_4.ll2
-rw-r--r--polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll2
-rw-r--r--polly/test/ScopInfo/not-a-reduction.ll2
-rw-r--r--polly/test/ScopInfo/opaque-struct.ll2
-rw-r--r--polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll2
-rw-r--r--polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll2
-rw-r--r--polly/test/ScopInfo/parameter-constant-division.ll4
-rw-r--r--polly/test/ScopInfo/parameter_in_dead_statement.ll8
-rw-r--r--polly/test/ScopInfo/parameter_product.ll2
-rw-r--r--polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll2
-rw-r--r--polly/test/ScopInfo/partially_invariant_load_1.ll4
-rw-r--r--polly/test/ScopInfo/partially_invariant_load_2.ll2
-rw-r--r--polly/test/ScopInfo/phi-in-non-affine-region.ll2
-rw-r--r--polly/test/ScopInfo/phi_after_error_block.ll2
-rw-r--r--polly/test/ScopInfo/phi_condition_modeling_1.ll2
-rw-r--r--polly/test/ScopInfo/phi_condition_modeling_2.ll2
-rw-r--r--polly/test/ScopInfo/phi_conditional_simple_1.ll2
-rw-r--r--polly/test/ScopInfo/phi_loop_carried_float.ll2
-rw-r--r--polly/test/ScopInfo/phi_not_grouped_at_top.ll2
-rw-r--r--polly/test/ScopInfo/phi_scalar_simple_1.ll2
-rw-r--r--polly/test/ScopInfo/phi_scalar_simple_2.ll2
-rw-r--r--polly/test/ScopInfo/phi_with_invoke_edge.ll2
-rw-r--r--polly/test/ScopInfo/pointer-comparison-no-nsw.ll2
-rw-r--r--polly/test/ScopInfo/pointer-comparison.ll2
-rw-r--r--polly/test/ScopInfo/pointer-type-expressions.ll2
-rw-r--r--polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll2
-rw-r--r--polly/test/ScopInfo/polly-timeout-parameter-bounds.ll2
-rw-r--r--polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll2
-rw-r--r--polly/test/ScopInfo/process_added_dimensions.ll2
-rw-r--r--polly/test/ScopInfo/pwaff-complexity-bailout.ll2
-rw-r--r--polly/test/ScopInfo/ranged_parameter.ll2
-rw-r--r--polly/test/ScopInfo/ranged_parameter_2.ll2
-rw-r--r--polly/test/ScopInfo/ranged_parameter_wrap.ll2
-rw-r--r--polly/test/ScopInfo/ranged_parameter_wrap_2.ll2
-rw-r--r--polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll2
-rw-r--r--polly/test/ScopInfo/read-only-scalar-used-in-phi.ll2
-rw-r--r--polly/test/ScopInfo/read-only-scalars.ll4
-rw-r--r--polly/test/ScopInfo/read-only-statements.ll2
-rw-r--r--polly/test/ScopInfo/reduction_alternating_base.ll2
-rw-r--r--polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll2
-rw-r--r--polly/test/ScopInfo/reduction_different_index.ll2
-rw-r--r--polly/test/ScopInfo/reduction_different_index1.ll2
-rw-r--r--polly/test/ScopInfo/reduction_disabled_multiplicative.ll2
-rw-r--r--polly/test/ScopInfo/reduction_escaping_intermediate.ll2
-rw-r--r--polly/test/ScopInfo/reduction_escaping_intermediate_2.ll2
-rw-r--r--polly/test/ScopInfo/reduction_invalid_different_operators.ll2
-rw-r--r--polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll2
-rw-r--r--polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll2
-rw-r--r--polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll2
-rw-r--r--polly/test/ScopInfo/reduction_multiple_simple_binary.ll2
-rw-r--r--polly/test/ScopInfo/reduction_non_overlapping_chains.ll2
-rw-r--r--polly/test/ScopInfo/reduction_only_reduction_like_access.ll2
-rw-r--r--polly/test/ScopInfo/reduction_simple_fp.ll2
-rw-r--r--polly/test/ScopInfo/reduction_simple_w_constant.ll2
-rw-r--r--polly/test/ScopInfo/reduction_simple_w_iv.ll2
-rw-r--r--polly/test/ScopInfo/reduction_two_identical_reads.ll4
-rw-r--r--polly/test/ScopInfo/redundant_parameter_constraint.ll2
-rw-r--r--polly/test/ScopInfo/region-with-instructions.ll2
-rw-r--r--polly/test/ScopInfo/remarks.ll2
-rw-r--r--polly/test/ScopInfo/required-invariant-loop-bounds.ll4
-rw-r--r--polly/test/ScopInfo/restriction_in_dead_block.ll2
-rw-r--r--polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll4
-rw-r--r--polly/test/ScopInfo/run-time-check-many-parameters.ll2
-rw-r--r--polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll4
-rw-r--r--polly/test/ScopInfo/run-time-check-read-only-arrays.ll2
-rw-r--r--polly/test/ScopInfo/same-base-address-scalar-and-array.ll2
-rw-r--r--polly/test/ScopInfo/scalar.ll2
-rw-r--r--polly/test/ScopInfo/scalar_dependence_cond_br.ll2
-rw-r--r--polly/test/ScopInfo/scalar_to_array.ll4
-rw-r--r--polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll2
-rw-r--r--polly/test/ScopInfo/scev-invalidated.ll2
-rw-r--r--polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll2
-rw-r--r--polly/test/ScopInfo/schedule-const-post-dominator-walk.ll2
-rw-r--r--polly/test/ScopInfo/schedule-constuction-endless-loop1.ll2
-rw-r--r--polly/test/ScopInfo/schedule-constuction-endless-loop2.ll2
-rw-r--r--polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll2
-rw-r--r--polly/test/ScopInfo/scop-affine-parameter-ordering.ll2
-rw-r--r--polly/test/ScopInfo/sign_wrapped_set.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_1.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_2.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_unsigned.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_unsigned_2.ll2
-rw-r--r--polly/test/ScopInfo/simple_loop_unsigned_3.ll2
-rw-r--r--polly/test/ScopInfo/simple_nonaffine_loop_not.ll2
-rw-r--r--polly/test/ScopInfo/smax.ll2
-rw-r--r--polly/test/ScopInfo/statistics.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_no_after_split.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_no_dependence.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_on_store.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_on_synthesizable.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_phi_in_stmt.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_scalar_dependence.ll2
-rw-r--r--polly/test/ScopInfo/stmt_split_within_loop.ll2
-rw-r--r--polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll2
-rw-r--r--polly/test/ScopInfo/switch-1.ll4
-rw-r--r--polly/test/ScopInfo/switch-2.ll4
-rw-r--r--polly/test/ScopInfo/switch-3.ll4
-rw-r--r--polly/test/ScopInfo/switch-4.ll4
-rw-r--r--polly/test/ScopInfo/switch-5.ll4
-rw-r--r--polly/test/ScopInfo/switch-6.ll4
-rw-r--r--polly/test/ScopInfo/switch-7.ll5
-rw-r--r--polly/test/ScopInfo/tempscop-printing.ll2
-rw-r--r--polly/test/ScopInfo/test-wrapping-in-condition.ll4
-rw-r--r--polly/test/ScopInfo/truncate-1.ll2
-rw-r--r--polly/test/ScopInfo/truncate-2.ll2
-rw-r--r--polly/test/ScopInfo/truncate-3.ll2
-rw-r--r--polly/test/ScopInfo/two-loops-one-infinite.ll2
-rw-r--r--polly/test/ScopInfo/two-loops-right-after-each-other.ll2
-rw-r--r--polly/test/ScopInfo/undef_in_cond.ll2
-rw-r--r--polly/test/ScopInfo/unnamed_nonaffine.ll4
-rw-r--r--polly/test/ScopInfo/unnamed_stmts.ll2
-rw-r--r--polly/test/ScopInfo/unpredictable_nonscop_loop.ll2
-rw-r--r--polly/test/ScopInfo/unprofitable_scalar-accs.ll4
-rw-r--r--polly/test/ScopInfo/unsigned-condition.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-1.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-2.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-3.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-4.ll2
-rw-r--r--polly/test/ScopInfo/unsigned-division-5.ll2
-rw-r--r--polly/test/ScopInfo/unsigned_wrap_uge.ll2
-rw-r--r--polly/test/ScopInfo/unsigned_wrap_ugt.ll2
-rw-r--r--polly/test/ScopInfo/unsigned_wrap_ule.ll2
-rw-r--r--polly/test/ScopInfo/unsigned_wrap_ult.ll2
-rw-r--r--polly/test/ScopInfo/user_context.ll8
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll2
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions_2.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_assumptions_3.ll4
-rw-r--r--polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll4
-rw-r--r--polly/test/ScopInfo/variant_base_pointer.ll4
-rw-r--r--polly/test/ScopInfo/variant_load_empty_domain.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_0.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_1.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_2.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_3.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_4.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_5.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_6.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_7.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_slow_1.ll2
-rw-r--r--polly/test/ScopInfo/wraping_signed_expr_slow_2.ll2
-rw-r--r--polly/test/ScopInfo/zero_ext_of_truncate.ll2
-rw-r--r--polly/test/ScopInfo/zero_ext_of_truncate_2.ll2
-rw-r--r--polly/test/ScopInfo/zero_ext_space_mismatch.ll2
-rw-r--r--polly/test/ScopInliner/invariant-load-func.ll4
-rw-r--r--polly/test/Simplify/coalesce_3partials.ll2
-rw-r--r--polly/test/Simplify/coalesce_disjointelements.ll2
-rw-r--r--polly/test/Simplify/coalesce_overlapping.ll2
-rw-r--r--polly/test/Simplify/coalesce_partial.ll2
-rw-r--r--polly/test/Simplify/dead_access_load.ll1
-rw-r--r--polly/test/Simplify/dead_access_phi.ll1
-rw-r--r--polly/test/Simplify/dead_access_value.ll1
-rw-r--r--polly/test/Simplify/dead_instruction.ll1
-rw-r--r--polly/test/Simplify/emptyaccessdomain.ll2
-rw-r--r--polly/test/Simplify/exit_phi_accesses-2.ll2
-rw-r--r--polly/test/Simplify/func-b320a7.ll2
-rw-r--r--polly/test/Simplify/gemm.ll2
-rw-r--r--polly/test/Simplify/nocoalesce_differentvalues.ll2
-rw-r--r--polly/test/Simplify/nocoalesce_elementmismatch.ll2
-rw-r--r--polly/test/Simplify/nocoalesce_readbetween.ll2
-rw-r--r--polly/test/Simplify/nocoalesce_writebetween.ll2
-rw-r--r--polly/test/Simplify/notdead_region_exitphi.ll1
-rw-r--r--polly/test/Simplify/notdead_region_innerphi.ll1
-rw-r--r--polly/test/Simplify/notredundant_region_loop.ll2
-rw-r--r--polly/test/Simplify/notredundant_region_middle.ll1
-rw-r--r--polly/test/Simplify/notredundant_synthesizable_unknownit.ll1
-rw-r--r--polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll2
-rw-r--r--polly/test/Simplify/overwritten.ll1
-rw-r--r--polly/test/Simplify/overwritten_3phi.ll2
-rw-r--r--polly/test/Simplify/overwritten_3store.ll1
-rw-r--r--polly/test/Simplify/overwritten_implicit_and_explicit.ll2
-rw-r--r--polly/test/Simplify/overwritten_loadbetween.ll1
-rw-r--r--polly/test/Simplify/overwritten_scalar.ll2
-rw-r--r--polly/test/Simplify/pass_existence.ll1
-rw-r--r--polly/test/Simplify/phi_in_regionstmt.ll1
-rw-r--r--polly/test/Simplify/pr33323.ll2
-rw-r--r--polly/test/Simplify/redundant.ll1
-rw-r--r--polly/test/Simplify/redundant_differentindex.ll1
-rw-r--r--polly/test/Simplify/redundant_region.ll2
-rw-r--r--polly/test/Simplify/redundant_region_scalar.ll2
-rw-r--r--polly/test/Simplify/redundant_scalarwrite.ll2
-rw-r--r--polly/test/Simplify/redundant_storebetween.ll1
-rw-r--r--polly/test/Simplify/scalability1.ll2
-rw-r--r--polly/test/Simplify/scalability2.ll2
-rw-r--r--polly/test/Simplify/sweep_mapped_phi.ll2
-rw-r--r--polly/test/Simplify/sweep_mapped_value.ll2
-rw-r--r--polly/test/Simplify/ununsed_read_in_region_entry.ll4
-rw-r--r--polly/test/Support/Plugins.ll2
-rw-r--r--polly/test/Support/isl-args.ll8
-rw-r--r--polly/test/lit.site.cfg.in1
-rw-r--r--polly/test/polly.ll2
-rw-r--r--utils/bazel/.bazelrc3
-rw-r--r--utils/bazel/llvm-project-overlay/bolt/BUILD.bazel5
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/driver.bzl1
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h3
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel13
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel7
-rw-r--r--utils/bazel/llvm_configs/config.h.cmake3
2554 files changed, 52298 insertions, 22649 deletions
diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge
index 78a9cb77ff7d..e1c66ac18e7a 100755
--- a/.ci/generate-buildkite-pipeline-premerge
+++ b/.ci/generate-buildkite-pipeline-premerge
@@ -68,7 +68,7 @@ function compute-projects-to-test() {
done
;;
clang)
- for p in clang-tools-extra compiler-rt flang lldb cross-project-tests; do
+ for p in clang-tools-extra compiler-rt lldb cross-project-tests; do
echo $p
done
;;
diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index 7ffb5d7c0081..817ad288aa34 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -106,9 +106,14 @@ equals output offset.
`BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
(branch or call instruction). If not set, it signifies a control flow target
(basic block offset).
+
`InputAddr` is omitted for equal offsets in input and output function. In this
case, `BRANCHENTRY` bits are encoded separately in a `BranchEntries` bitvector.
+Deleted basic blocks are emitted as having `OutputOffset` equal to the size of
+the function. They don't affect address translation and only participate in
+input basic block mapping.
+
### Secondary Entry Points table
The table is emitted for hot fragments only. It contains `NumSecEntryPoints`
offsets denoting secondary entry points, delta encoded, implicitly starting at zero.
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 75765819ac46..4ec3de3da1bf 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -17,6 +17,7 @@
#include "bolt/Core/BinaryData.h"
#include "bolt/Core/BinarySection.h"
#include "bolt/Core/DebugData.h"
+#include "bolt/Core/DynoStats.h"
#include "bolt/Core/JumpTable.h"
#include "bolt/Core/MCPlusBuilder.h"
#include "bolt/RuntimeLibs/RuntimeLibrary.h"
@@ -359,7 +360,7 @@ public:
void setFileBuildID(StringRef ID) { FileBuildID = std::string(ID); }
bool hasSymbolsWithFileName() const { return HasSymbolsWithFileName; }
- void setHasSymbolsWithFileName(bool Value) { HasSymbolsWithFileName = true; }
+ void setHasSymbolsWithFileName(bool Value) { HasSymbolsWithFileName = Value; }
/// Return true if relocations against symbol with a given name
/// must be created.
@@ -677,6 +678,9 @@ public:
/// have an origin file name available.
bool HasSymbolsWithFileName{false};
+ /// Does the binary have BAT section.
+ bool HasBATSection{false};
+
/// Sum of execution count of all functions
uint64_t SumExecutionCount{0};
@@ -714,6 +718,9 @@ public:
uint64_t NumStaleBlocksWithEqualIcount{0};
} Stats;
+ // Original binary execution count stats.
+ DynoStats InitialDynoStats;
+
// Address of the first allocated segment.
uint64_t FirstAllocAddress{std::numeric_limits<uint64_t>::max()};
@@ -1217,8 +1224,7 @@ public:
/// Return a signed value of \p Size stored at \p Address. The address has
/// to be a valid statically allocated address for the binary.
- ErrorOr<uint64_t> getSignedValueAtAddress(uint64_t Address,
- size_t Size) const;
+ ErrorOr<int64_t> getSignedValueAtAddress(uint64_t Address, size_t Size) const;
/// Special case of getUnsignedValueAtAddress() that uses a pointer size.
ErrorOr<uint64_t> getPointerAtAddress(uint64_t Address) const {
diff --git a/bolt/include/bolt/Passes/BinaryPasses.h b/bolt/include/bolt/Passes/BinaryPasses.h
index 5d7692559eda..ad8473c4aae0 100644
--- a/bolt/include/bolt/Passes/BinaryPasses.h
+++ b/bolt/include/bolt/Passes/BinaryPasses.h
@@ -16,6 +16,7 @@
#include "bolt/Core/BinaryContext.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/DynoStats.h"
+#include "bolt/Profile/BoltAddressTranslation.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <set>
@@ -52,15 +53,31 @@ public:
virtual Error runOnFunctions(BinaryContext &BC) = 0;
};
+/// A pass to set initial program-wide dynostats.
+class DynoStatsSetPass : public BinaryFunctionPass {
+public:
+ DynoStatsSetPass() : BinaryFunctionPass(false) {}
+
+ const char *getName() const override {
+ return "set dyno-stats before optimizations";
+ }
+
+ bool shouldPrint(const BinaryFunction &BF) const override { return false; }
+
+ Error runOnFunctions(BinaryContext &BC) override {
+ BC.InitialDynoStats = getDynoStats(BC.getBinaryFunctions(), BC.isAArch64());
+ return Error::success();
+ }
+};
+
/// A pass to print program-wide dynostats.
class DynoStatsPrintPass : public BinaryFunctionPass {
protected:
- DynoStats PrevDynoStats;
std::string Title;
public:
- DynoStatsPrintPass(const DynoStats &PrevDynoStats, const char *Title)
- : BinaryFunctionPass(false), PrevDynoStats(PrevDynoStats), Title(Title) {}
+ DynoStatsPrintPass(const char *Title)
+ : BinaryFunctionPass(false), Title(Title) {}
const char *getName() const override {
return "print dyno-stats after optimizations";
@@ -69,6 +86,7 @@ public:
bool shouldPrint(const BinaryFunction &BF) const override { return false; }
Error runOnFunctions(BinaryContext &BC) override {
+ const DynoStats PrevDynoStats = BC.InitialDynoStats;
const DynoStats NewDynoStats =
getDynoStats(BC.getBinaryFunctions(), BC.isAArch64());
const bool Changed = (NewDynoStats != PrevDynoStats);
@@ -399,8 +417,11 @@ public:
/// Prints a list of the top 100 functions sorted by a set of
/// dyno stats categories.
class PrintProgramStats : public BinaryFunctionPass {
+ BoltAddressTranslation *BAT = nullptr;
+
public:
- explicit PrintProgramStats() : BinaryFunctionPass(false) {}
+ explicit PrintProgramStats(BoltAddressTranslation *BAT = nullptr)
+ : BinaryFunctionPass(false), BAT(BAT) {}
const char *getName() const override { return "print-stats"; }
bool shouldPrint(const BinaryFunction &) const override { return false; }
diff --git a/bolt/include/bolt/Passes/MCF.h b/bolt/include/bolt/Passes/MCF.h
index feac7f88ac11..3fe674463bf1 100644
--- a/bolt/include/bolt/Passes/MCF.h
+++ b/bolt/include/bolt/Passes/MCF.h
@@ -9,20 +9,14 @@
#ifndef BOLT_PASSES_MCF_H
#define BOLT_PASSES_MCF_H
+#include "bolt/Passes/BinaryPasses.h"
+#include "llvm/Support/CommandLine.h"
+
namespace llvm {
namespace bolt {
-class BinaryFunction;
class DataflowInfoManager;
-enum MCFCostFunction : char {
- MCF_DISABLE = 0,
- MCF_LINEAR,
- MCF_QUADRATIC,
- MCF_LOG,
- MCF_BLAMEFTS
-};
-
/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
/// without the Usability Burden" by Diego Novillo to make basic block counts
/// equal if we show that A dominates B, B post-dominates A and they are in the
@@ -31,23 +25,18 @@ void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF);
/// Fill edge counts based on the basic block count. Used in nonLBR mode when
/// we only have bb count.
-void estimateEdgeCounts(BinaryFunction &BF);
-
-/// Entry point for computing a min-cost flow for the CFG with the goal
-/// of fixing the flow of the CFG edges, that is, making sure it obeys the
-/// flow-conservation equation SumInEdges = SumOutEdges.
-///
-/// To do this, we create an instance of the min-cost flow problem in a
-/// similar way as the one discussed in the work of Roy Levin "Completing
-/// Incomplete Edge Profile by Applying Minimum Cost Circulation Algorithms".
-/// We do a few things differently, though. We don't populate edge counts using
-/// weights coming from a static branch prediction technique and we don't
-/// use the same cost function.
-///
-/// If cost function BlameFTs is used, assign all remaining flow to
-/// fall-throughs. This is used when the sampling is based on taken branches
-/// that do not account for them.
-void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction);
+class EstimateEdgeCounts : public BinaryFunctionPass {
+ void runOnFunction(BinaryFunction &BF);
+
+public:
+ explicit EstimateEdgeCounts(const cl::opt<bool> &PrintPass)
+ : BinaryFunctionPass(PrintPass) {}
+
+ const char *getName() const override { return "estimate-edge-counts"; }
+
+ /// Pass entry point
+ Error runOnFunctions(BinaryContext &BC) override;
+};
} // end namespace bolt
} // end namespace llvm
diff --git a/bolt/include/bolt/Passes/StokeInfo.h b/bolt/include/bolt/Passes/StokeInfo.h
index 76417e6a2c3b..a18c2a05d015 100644
--- a/bolt/include/bolt/Passes/StokeInfo.h
+++ b/bolt/include/bolt/Passes/StokeInfo.h
@@ -87,10 +87,10 @@ struct StokeFuncInfo {
<< "," << NumBlocks << "," << IsLoopFree << "," << NumLoops << ","
<< MaxLoopDepth << "," << HotSize << "," << TotalSize << ","
<< Score << "," << HasCall << ",\"{ ";
- for (std::string S : DefIn)
+ for (const std::string &S : DefIn)
Outfile << "%" << S << " ";
Outfile << "}\",\"{ ";
- for (std::string S : LiveOut)
+ for (const std::string &S : LiveOut)
Outfile << "%" << S << " ";
Outfile << "}\"," << HeapOut << "," << StackOut << "," << HasRipAddr
<< "," << Omitted << "\n";
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index 68b993ee363c..65b9ba874368 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -70,7 +70,7 @@ class BinaryFunction;
class BoltAddressTranslation {
public:
// In-memory representation of the address translation table
- using MapTy = std::map<uint32_t, uint32_t>;
+ using MapTy = std::multimap<uint32_t, uint32_t>;
// List of taken fall-throughs
using FallthroughListTy = SmallVector<std::pair<uint64_t, uint64_t>, 16>;
@@ -90,7 +90,7 @@ public:
std::error_code parse(raw_ostream &OS, StringRef Buf);
/// Dump the parsed address translation tables
- void dump(raw_ostream &OS);
+ void dump(raw_ostream &OS) const;
/// If the maps are loaded in memory, perform the lookup to translate LBR
/// addresses in function located at \p FuncAddress.
@@ -107,7 +107,12 @@ public:
/// If available, fetch the address of the hot part linked to the cold part
/// at \p Address. Return 0 otherwise.
- uint64_t fetchParentAddress(uint64_t Address) const;
+ uint64_t fetchParentAddress(uint64_t Address) const {
+ auto Iter = ColdPartSource.find(Address);
+ if (Iter == ColdPartSource.end())
+ return 0;
+ return Iter->second;
+ }
/// True if the input binary has a translation table we can use to convert
/// addresses when aggregating profile
@@ -132,7 +137,8 @@ private:
/// emitted for the start of the BB. More entries may be emitted to cover
/// the location of calls or any instruction that may change control flow.
void writeEntriesForBB(MapTy &Map, const BinaryBasicBlock &BB,
- uint64_t FuncInputAddress, uint64_t FuncOutputAddress);
+ uint64_t FuncInputAddress,
+ uint64_t FuncOutputAddress) const;
/// Write the serialized address translation table for a function.
template <bool Cold>
@@ -147,7 +153,7 @@ private:
/// Returns the bitmask with set bits corresponding to indices of BRANCHENTRY
/// entries in function address translation map.
- APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems);
+ APInt calculateBranchEntriesBitMask(MapTy &Map, size_t EqualElems) const;
/// Calculate the number of equal offsets (output = input - skew) in the
/// beginning of the function.
@@ -178,14 +184,9 @@ private:
public:
/// Map basic block input offset to a basic block index and hash pair.
class BBHashMapTy {
- class EntryTy {
+ struct EntryTy {
unsigned Index;
size_t Hash;
-
- public:
- unsigned getBBIndex() const { return Index; }
- size_t getBBHash() const { return Hash; }
- EntryTy(unsigned Index, size_t Hash) : Index(Index), Hash(Hash) {}
};
std::map<uint32_t, EntryTy> Map;
@@ -201,15 +202,15 @@ public:
}
unsigned getBBIndex(uint32_t BBInputOffset) const {
- return getEntry(BBInputOffset).getBBIndex();
+ return getEntry(BBInputOffset).Index;
}
size_t getBBHash(uint32_t BBInputOffset) const {
- return getEntry(BBInputOffset).getBBHash();
+ return getEntry(BBInputOffset).Hash;
}
void addEntry(uint32_t BBInputOffset, unsigned BBIndex, size_t BBHash) {
- Map.emplace(BBInputOffset, EntryTy(BBIndex, BBHash));
+ Map.emplace(BBInputOffset, EntryTy{BBIndex, BBHash});
}
size_t getNumBasicBlocks() const { return Map.size(); }
@@ -217,18 +218,14 @@ public:
auto begin() const { return Map.begin(); }
auto end() const { return Map.end(); }
auto upper_bound(uint32_t Offset) const { return Map.upper_bound(Offset); }
+ auto size() const { return Map.size(); }
};
/// Map function output address to its hash and basic blocks hash map.
class FuncHashesTy {
- class EntryTy {
+ struct EntryTy {
size_t Hash;
BBHashMapTy BBHashMap;
-
- public:
- size_t getBFHash() const { return Hash; }
- const BBHashMapTy &getBBHashMap() const { return BBHashMap; }
- EntryTy(size_t Hash) : Hash(Hash) {}
};
std::unordered_map<uint64_t, EntryTy> Map;
@@ -240,15 +237,15 @@ public:
public:
size_t getBFHash(uint64_t FuncOutputAddress) const {
- return getEntry(FuncOutputAddress).getBFHash();
+ return getEntry(FuncOutputAddress).Hash;
}
const BBHashMapTy &getBBHashMap(uint64_t FuncOutputAddress) const {
- return getEntry(FuncOutputAddress).getBBHashMap();
+ return getEntry(FuncOutputAddress).BBHashMap;
}
void addEntry(uint64_t FuncOutputAddress, size_t BFHash) {
- Map.emplace(FuncOutputAddress, EntryTy(BFHash));
+ Map.emplace(FuncOutputAddress, EntryTy{BFHash, BBHashMapTy()});
}
size_t getNumFunctions() const { return Map.size(); };
@@ -256,7 +253,7 @@ public:
size_t getNumBasicBlocks() const {
size_t NumBasicBlocks{0};
for (auto &I : Map)
- NumBasicBlocks += I.second.getBBHashMap().getNumBasicBlocks();
+ NumBasicBlocks += I.second.BBHashMap.getNumBasicBlocks();
return NumBasicBlocks;
}
};
@@ -278,7 +275,9 @@ public:
/// Returns the number of basic blocks in a function.
size_t getNumBasicBlocks(uint64_t OutputAddress) const {
- return NumBasicBlocksMap.at(OutputAddress);
+ auto It = NumBasicBlocksMap.find(OutputAddress);
+ assert(It != NumBasicBlocksMap.end());
+ return It->second;
}
private:
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index c158a9bb3e3f..6453b3070ceb 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -15,6 +15,7 @@
#define BOLT_PROFILE_DATA_AGGREGATOR_H
#include "bolt/Profile/DataReader.h"
+#include "bolt/Profile/YAMLProfileWriter.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Program.h"
@@ -248,7 +249,7 @@ private:
BinaryFunction *getBATParentFunction(const BinaryFunction &Func) const;
/// Retrieve the location name to be used for samples recorded in \p Func.
- StringRef getLocationName(const BinaryFunction &Func) const;
+ static StringRef getLocationName(const BinaryFunction &Func, bool BAT);
/// Semantic actions - parser hooks to interpret parsed perf samples
/// Register a sample (non-LBR mode), i.e. a new hit at \p Address
@@ -490,6 +491,8 @@ public:
/// Parse the output generated by "perf buildid-list" to extract build-ids
/// and return a file name matching a given \p FileBuildID.
std::optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);
+
+ friend class YAMLProfileWriter;
};
} // namespace bolt
} // namespace llvm
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index ad2eb18caf10..db02dc0fae4e 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -142,7 +142,7 @@ BinaryContext::BinaryContext(std::unique_ptr<MCContext> Ctx,
AsmInfo(std::move(AsmInfo)), MII(std::move(MII)), STI(std::move(STI)),
InstPrinter(std::move(InstPrinter)), MIA(std::move(MIA)),
MIB(std::move(MIB)), MRI(std::move(MRI)), DisAsm(std::move(DisAsm)),
- Logger(Logger) {
+ Logger(Logger), InitialDynoStats(isAArch64()) {
Relocation::Arch = this->TheTriple->getArch();
RegularPageSize = isAArch64() ? RegularPageSizeAArch64 : RegularPageSizeX86;
PageAlign = opts::NoHugePages ? RegularPageSize : HugePageSize;
@@ -934,10 +934,13 @@ std::string BinaryContext::generateJumpTableName(const BinaryFunction &BF,
uint64_t Offset = 0;
if (const JumpTable *JT = BF.getJumpTableContainingAddress(Address)) {
Offset = Address - JT->getAddress();
- auto Itr = JT->Labels.find(Offset);
- if (Itr != JT->Labels.end())
- return std::string(Itr->second->getName());
- Id = JumpTableIds.at(JT->getAddress());
+ auto JTLabelsIt = JT->Labels.find(Offset);
+ if (JTLabelsIt != JT->Labels.end())
+ return std::string(JTLabelsIt->second->getName());
+
+ auto JTIdsIt = JumpTableIds.find(JT->getAddress());
+ assert(JTIdsIt != JumpTableIds.end());
+ Id = JTIdsIt->second;
} else {
Id = JumpTableIds[Address] = BF.JumpTables.size();
}
@@ -1322,7 +1325,9 @@ void BinaryContext::processInterproceduralReferences() {
InterproceduralReferences) {
BinaryFunction &Function = *It.first;
uint64_t Address = It.second;
- if (!Address || Function.isIgnored())
+ // Process interprocedural references from ignored functions in BAT mode
+ // (non-simple in non-relocation mode) to properly register entry points
+ if (!Address || (Function.isIgnored() && !HasBATSection))
continue;
BinaryFunction *TargetFunction =
@@ -2212,8 +2217,8 @@ ErrorOr<uint64_t> BinaryContext::getUnsignedValueAtAddress(uint64_t Address,
return DE.getUnsigned(&ValueOffset, Size);
}
-ErrorOr<uint64_t> BinaryContext::getSignedValueAtAddress(uint64_t Address,
- size_t Size) const {
+ErrorOr<int64_t> BinaryContext::getSignedValueAtAddress(uint64_t Address,
+ size_t Size) const {
const ErrorOr<const BinarySection &> Section = getSectionForAddress(Address);
if (!Section)
return std::make_error_code(std::errc::bad_address);
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 6f86ddc77454..0b44acb0816f 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -813,7 +813,9 @@ void BinaryEmitter::emitJumpTable(const JumpTable &JT, MCSection *HotSection,
// determining its destination.
std::map<MCSymbol *, uint64_t> LabelCounts;
if (opts::JumpTables > JTS_SPLIT && !JT.Counts.empty()) {
- MCSymbol *CurrentLabel = JT.Labels.at(0);
+ auto It = JT.Labels.find(0);
+ assert(It != JT.Labels.end());
+ MCSymbol *CurrentLabel = It->second;
uint64_t CurrentLabelCount = 0;
for (unsigned Index = 0; Index < JT.Entries.size(); ++Index) {
auto LI = JT.Labels.find(Index * JT.EntrySize);
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 10b93e702984..c897392f2a57 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -851,15 +851,19 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size,
return IndirectBranchType::UNKNOWN;
}
- // RIP-relative addressing should be converted to symbol form by now
- // in processed instructions (but not in jump).
- if (DispExpr) {
+ auto getExprValue = [&](const MCExpr *Expr) {
const MCSymbol *TargetSym;
uint64_t TargetOffset;
- std::tie(TargetSym, TargetOffset) = BC.MIB->getTargetSymbolInfo(DispExpr);
+ std::tie(TargetSym, TargetOffset) = BC.MIB->getTargetSymbolInfo(Expr);
ErrorOr<uint64_t> SymValueOrError = BC.getSymbolValue(*TargetSym);
- assert(SymValueOrError && "global symbol needs a value");
- ArrayStart = *SymValueOrError + TargetOffset;
+ assert(SymValueOrError && "Global symbol needs a value");
+ return *SymValueOrError + TargetOffset;
+ };
+
+ // RIP-relative addressing should be converted to symbol form by now
+ // in processed instructions (but not in jump).
+ if (DispExpr) {
+ ArrayStart = getExprValue(DispExpr);
BaseRegNum = BC.MIB->getNoRegister();
if (BC.isAArch64()) {
ArrayStart &= ~0xFFFULL;
@@ -1666,7 +1670,8 @@ void BinaryFunction::postProcessEntryPoints() {
// In non-relocation mode there's potentially an external undetectable
// reference to the entry point and hence we cannot move this entry
// point. Optimizing without moving could be difficult.
- if (!BC.HasRelocations)
+ // In BAT mode, register any known entry points for CFG construction.
+ if (!BC.HasRelocations && !BC.HasBATSection)
setSimple(false);
const uint32_t Offset = KV.first;
@@ -3697,6 +3702,13 @@ BinaryFunction::BasicBlockListType BinaryFunction::dfs() const {
size_t BinaryFunction::computeHash(bool UseDFS, HashFunction HashFunction,
OperandHashFuncTy OperandHashFunc) const {
+ LLVM_DEBUG({
+ dbgs() << "BOLT-DEBUG: computeHash " << getPrintName() << ' '
+ << (UseDFS ? "dfs" : "bin") << " order "
+ << (HashFunction == HashFunction::StdHash ? "std::hash" : "xxh3")
+ << '\n';
+ });
+
if (size() == 0)
return 0;
diff --git a/bolt/lib/Core/DebugNames.cpp b/bolt/lib/Core/DebugNames.cpp
index 049244c4b515..791cbc6df082 100644
--- a/bolt/lib/Core/DebugNames.cpp
+++ b/bolt/lib/Core/DebugNames.cpp
@@ -112,8 +112,6 @@ void DWARF5AcceleratorTable::addUnit(DWARFUnit &Unit,
// Returns true if DW_TAG_variable should be included in .debug-names based on
// section 6.1.1.1 for DWARF5 spec.
static bool shouldIncludeVariable(const DWARFUnit &Unit, const DIE &Die) {
- if (Die.findAttribute(dwarf::Attribute::DW_AT_declaration))
- return false;
const DIEValue LocAttrInfo =
Die.findAttribute(dwarf::Attribute::DW_AT_location);
if (!LocAttrInfo)
@@ -148,6 +146,8 @@ static bool shouldIncludeVariable(const DWARFUnit &Unit, const DIE &Die) {
bool static canProcess(const DWARFUnit &Unit, const DIE &Die,
std::string &NameToUse, const bool TagsOnly) {
+ if (Die.findAttribute(dwarf::Attribute::DW_AT_declaration))
+ return false;
switch (Die.getTag()) {
case dwarf::DW_TAG_base_type:
case dwarf::DW_TAG_class_type:
diff --git a/bolt/lib/Core/DynoStats.cpp b/bolt/lib/Core/DynoStats.cpp
index 5de0f9e0d6b8..1d9818777596 100644
--- a/bolt/lib/Core/DynoStats.cpp
+++ b/bolt/lib/Core/DynoStats.cpp
@@ -114,8 +114,9 @@ void DynoStats::print(raw_ostream &OS, const DynoStats *Other,
for (auto &Stat : llvm::reverse(SortedHistogram)) {
OS << format("%20s,%'18lld", Printer->getOpcodeName(Stat.second).data(),
Stat.first * opts::DynoStatsScale);
-
- MaxOpcodeHistogramTy MaxMultiMap = OpcodeHistogram.at(Stat.second).second;
+ auto It = OpcodeHistogram.find(Stat.second);
+ assert(It != OpcodeHistogram.end());
+ MaxOpcodeHistogramTy MaxMultiMap = It->second.second;
// Start with function name:BB offset with highest execution count.
for (auto &Max : llvm::reverse(MaxMultiMap)) {
OS << format(", %'18lld, ", Max.first * opts::DynoStatsScale)
diff --git a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp b/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
index 2373710c9edd..bbcc9751c0cb 100644
--- a/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
+++ b/bolt/lib/Passes/BinaryFunctionCallGraph.cpp
@@ -56,7 +56,9 @@ std::deque<BinaryFunction *> BinaryFunctionCallGraph::buildTraversalOrder() {
std::stack<NodeId> Worklist;
for (BinaryFunction *Func : Funcs) {
- const NodeId Id = FuncToNodeId.at(Func);
+ auto It = FuncToNodeId.find(Func);
+ assert(It != FuncToNodeId.end());
+ const NodeId Id = It->second;
Worklist.push(Id);
NodeStatus[Id] = NEW;
}
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 298ba29ff5b3..2810f723719d 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1390,9 +1390,19 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
if (Function.isPLTFunction())
continue;
+ // Adjustment for BAT mode: the profile for BOLT split fragments is combined
+ // so only count the hot fragment.
+ const uint64_t Address = Function.getAddress();
+ bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() &&
+ BAT && BAT->isBATFunction(Address) &&
+ !BAT->fetchParentAddress(Address);
+
++NumRegularFunctions;
- if (!Function.isSimple()) {
+ // In BOLTed binaries split functions are non-simple (due to non-relocation
+ // mode), but the original function is known to be simple and we have a
+ // valid profile for it.
+ if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) {
if (Function.hasProfile())
++NumNonSimpleProfiledFunctions;
continue;
@@ -1553,23 +1563,28 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
const bool Ascending =
opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
- if (SortAll) {
- llvm::stable_sort(Functions,
- [Ascending, &Stats](const BinaryFunction *A,
- const BinaryFunction *B) {
- return Ascending ? Stats.at(A) < Stats.at(B)
- : Stats.at(B) < Stats.at(A);
- });
- } else {
- llvm::stable_sort(
- Functions, [Ascending, &Stats](const BinaryFunction *A,
- const BinaryFunction *B) {
- const DynoStats &StatsA = Stats.at(A);
- const DynoStats &StatsB = Stats.at(B);
- return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
- : StatsB.lessThan(StatsA, opts::PrintSortedBy);
- });
- }
+ std::function<bool(const DynoStats &, const DynoStats &)>
+ DynoStatsComparator =
+ SortAll ? [](const DynoStats &StatsA,
+ const DynoStats &StatsB) { return StatsA < StatsB; }
+ : [](const DynoStats &StatsA, const DynoStats &StatsB) {
+ return StatsA.lessThan(StatsB, opts::PrintSortedBy);
+ };
+
+ llvm::stable_sort(Functions,
+ [Ascending, &Stats, DynoStatsComparator](
+ const BinaryFunction *A, const BinaryFunction *B) {
+ auto StatsItr = Stats.find(A);
+ assert(StatsItr != Stats.end());
+ const DynoStats &StatsA = StatsItr->second;
+
+ StatsItr = Stats.find(B);
+ assert(StatsItr != Stats.end());
+ const DynoStats &StatsB = StatsItr->second;
+
+ return Ascending ? DynoStatsComparator(StatsA, StatsB)
+ : DynoStatsComparator(StatsB, StatsA);
+ });
BC.outs() << "BOLT-INFO: top functions sorted by ";
if (SortAll) {
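
The sort rewrite above selects the DynoStats comparison once and funnels both the SortAll and sort-by-key cases through a single stable_sort, with ascending/descending handled by swapping argument order. A standalone sketch of the same shape in plain C++ (Stats and its fields are illustrative stand-ins):

#include <algorithm>
#include <functional>
#include <vector>

struct Stats { long Total; long Key; };

void sortStats(std::vector<Stats> &V, bool SortAll, bool Ascending) {
  // Both lambdas are captureless, so the conditional expression converts
  // them to a common function pointer assignable to std::function.
  std::function<bool(const Stats &, const Stats &)> Cmp =
      SortAll ? [](const Stats &X, const Stats &Y) { return X.Total < Y.Total; }
              : [](const Stats &X, const Stats &Y) { return X.Key < Y.Key; };
  std::stable_sort(V.begin(), V.end(),
                   [&](const Stats &X, const Stats &Y) {
                     // Descending order is just the comparator with swapped args.
                     return Ascending ? Cmp(X, Y) : Cmp(Y, X);
                   });
}
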
diff --git a/bolt/lib/Passes/CacheMetrics.cpp b/bolt/lib/Passes/CacheMetrics.cpp
index b02d4303110b..21b420a5c2b0 100644
--- a/bolt/lib/Passes/CacheMetrics.cpp
+++ b/bolt/lib/Passes/CacheMetrics.cpp
@@ -67,7 +67,20 @@ calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
JumpCount += BI->Count;
- if (BBAddr.at(SrcBB) + BBSize.at(SrcBB) == BBAddr.at(DstBB))
+
+ auto BBAddrIt = BBAddr.find(SrcBB);
+ assert(BBAddrIt != BBAddr.end());
+ uint64_t SrcBBAddr = BBAddrIt->second;
+
+ auto BBSizeIt = BBSize.find(SrcBB);
+ assert(BBSizeIt != BBSize.end());
+ uint64_t SrcBBSize = BBSizeIt->second;
+
+ BBAddrIt = BBAddr.find(DstBB);
+ assert(BBAddrIt != BBAddr.end());
+ uint64_t DstBBAddr = BBAddrIt->second;
+
+ if (SrcBBAddr + SrcBBSize == DstBBAddr)
Score += BI->Count;
}
++BI;
@@ -149,20 +162,28 @@ double expectedCacheHitRatio(
for (BinaryFunction *BF : BinaryFunctions) {
if (BF->getLayout().block_empty())
continue;
- const uint64_t Page =
- BBAddr.at(BF->getLayout().block_front()) / ITLBPageSize;
- PageSamples[Page] += FunctionSamples.at(BF);
+ auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
+ assert(BBAddrIt != BBAddr.end());
+ const uint64_t Page = BBAddrIt->second / ITLBPageSize;
+
+ auto FunctionSamplesIt = FunctionSamples.find(BF);
+ assert(FunctionSamplesIt != FunctionSamples.end());
+ PageSamples[Page] += FunctionSamplesIt->second;
}
// Computing the expected number of misses for every function
double Misses = 0;
for (BinaryFunction *BF : BinaryFunctions) {
// Skip the function if it has no samples
- if (BF->getLayout().block_empty() || FunctionSamples.at(BF) == 0.0)
+ auto FunctionSamplesIt = FunctionSamples.find(BF);
+ assert(FunctionSamplesIt != FunctionSamples.end());
+ double Samples = FunctionSamplesIt->second;
+ if (BF->getLayout().block_empty() || Samples == 0.0)
continue;
- double Samples = FunctionSamples.at(BF);
- const uint64_t Page =
- BBAddr.at(BF->getLayout().block_front()) / ITLBPageSize;
+
+ auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
+ assert(BBAddrIt != BBAddr.end());
+ const uint64_t Page = BBAddrIt->second / ITLBPageSize;
// The probability that the page is not present in the cache
const double MissProb =
pow(1.0 - PageSamples[Page] / TotalSamples, ITLBEntries);
@@ -170,8 +191,10 @@ double expectedCacheHitRatio(
// Processing all callers of the function
for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
BinaryFunction *SrcFunction = Pair.first;
- const uint64_t SrcPage =
- BBAddr.at(SrcFunction->getLayout().block_front()) / ITLBPageSize;
+
+ BBAddrIt = BBAddr.find(SrcFunction->getLayout().block_front());
+ assert(BBAddrIt != BBAddr.end());
+ const uint64_t SrcPage = BBAddrIt->second / ITLBPageSize;
// Is this a 'long' or a 'short' call?
if (Page != SrcPage) {
// This is a miss
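
For reference, the miss-probability model the pass evaluates above: a page that receives PageSamples out of TotalSamples accesses is assumed absent from an iTLB with ITLBEntries entries with probability (1 - PageSamples/TotalSamples)^ITLBEntries. A small sketch with illustrative numbers (the constants are examples, not the values BOLT uses):

#include <cmath>
#include <cstdio>

int main() {
  const double ITLBEntries = 16;     // assumed iTLB capacity
  const double TotalSamples = 10000;
  const double PageSamples = 500;    // 5% of samples land on this page
  const double MissProb =
      std::pow(1.0 - PageSamples / TotalSamples, ITLBEntries);
  std::printf("expected miss probability: %.4f\n", MissProb); // ~0.44
  return 0;
}
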
diff --git a/bolt/lib/Passes/Inliner.cpp b/bolt/lib/Passes/Inliner.cpp
index 84e7d97067b0..f004a8eeea18 100644
--- a/bolt/lib/Passes/Inliner.cpp
+++ b/bolt/lib/Passes/Inliner.cpp
@@ -355,7 +355,9 @@ Inliner::inlineCall(BinaryBasicBlock &CallerBB,
std::vector<BinaryBasicBlock *> Successors(BB.succ_size());
llvm::transform(BB.successors(), Successors.begin(),
[&InlinedBBMap](const BinaryBasicBlock *BB) {
- return InlinedBBMap.at(BB);
+ auto It = InlinedBBMap.find(BB);
+ assert(It != InlinedBBMap.end());
+ return It->second;
});
if (CallerFunction.hasValidProfile() && Callee.hasValidProfile())
diff --git a/bolt/lib/Passes/MCF.cpp b/bolt/lib/Passes/MCF.cpp
index c3898d2dce98..77dea7369140 100644
--- a/bolt/lib/Passes/MCF.cpp
+++ b/bolt/lib/Passes/MCF.cpp
@@ -12,9 +12,11 @@
#include "bolt/Passes/MCF.h"
#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/DataflowInfoManager.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>
#include <vector>
@@ -29,19 +31,10 @@ namespace opts {
extern cl::OptionCategory BoltOptCategory;
-extern cl::opt<bool> TimeOpts;
-
static cl::opt<bool> IterativeGuess(
"iterative-guess",
cl::desc("in non-LBR mode, guess edge counts using iterative technique"),
cl::Hidden, cl::cat(BoltOptCategory));
-
-static cl::opt<bool> UseRArcs(
- "mcf-use-rarcs",
- cl::desc("in MCF, consider the possibility of cancelling flow to balance "
- "edges"),
- cl::Hidden, cl::cat(BoltOptCategory));
-
} // namespace opts
namespace llvm {
@@ -441,7 +434,7 @@ void equalizeBBCounts(DataflowInfoManager &Info, BinaryFunction &BF) {
}
}
-void estimateEdgeCounts(BinaryFunction &BF) {
+void EstimateEdgeCounts::runOnFunction(BinaryFunction &BF) {
EdgeWeightMap PredEdgeWeights;
EdgeWeightMap SuccEdgeWeights;
if (!opts::IterativeGuess) {
@@ -462,8 +455,24 @@ void estimateEdgeCounts(BinaryFunction &BF) {
recalculateBBCounts(BF, /*AllEdges=*/false);
}
-void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction) {
- llvm_unreachable("not implemented");
+Error EstimateEdgeCounts::runOnFunctions(BinaryContext &BC) {
+ if (llvm::none_of(llvm::make_second_range(BC.getBinaryFunctions()),
+ [](const BinaryFunction &BF) {
+ return BF.getProfileFlags() == BinaryFunction::PF_SAMPLE;
+ }))
+ return Error::success();
+
+ ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+ runOnFunction(BF);
+ };
+ ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+ return BF.getProfileFlags() != BinaryFunction::PF_SAMPLE;
+ };
+
+ ParallelUtilities::runOnEachFunction(
+ BC, ParallelUtilities::SchedulingPolicy::SP_BB_QUADRATIC, WorkFun,
+ SkipFunc, "EstimateEdgeCounts");
+ return Error::success();
}
} // namespace bolt
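
The new pass above only does work when at least one function carries a sample (non-LBR) profile, and then fans the per-function estimation out through ParallelUtilities::runOnEachFunction as shown in the hunk. The gating check, recast as a standalone sketch in plain C++ (ProfileKind and Function are stand-ins for the BOLT types):

#include <algorithm>
#include <cstdint>
#include <map>

enum class ProfileKind { None, LBR, Sample };

struct Function { ProfileKind Profile = ProfileKind::None; };

// Returns true if the edge-count estimation pass has anything to do.
bool shouldRunEstimateEdgeCounts(const std::map<uint64_t, Function> &Functions) {
  return std::any_of(Functions.begin(), Functions.end(),
                     [](const auto &KV) {
                       return KV.second.Profile == ProfileKind::Sample;
                     });
}
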
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 7cfb9c132c2c..cdfca2b9871a 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -20,10 +20,9 @@ namespace bolt {
const char *BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
-void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
- const BinaryBasicBlock &BB,
- uint64_t FuncInputAddress,
- uint64_t FuncOutputAddress) {
+void BoltAddressTranslation::writeEntriesForBB(
+ MapTy &Map, const BinaryBasicBlock &BB, uint64_t FuncInputAddress,
+ uint64_t FuncOutputAddress) const {
const uint64_t BBOutputOffset =
BB.getOutputAddressRange().first - FuncOutputAddress;
const uint32_t BBInputOffset = BB.getInputOffset();
@@ -55,7 +54,7 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
// and this deleted block will both share the same output address (the same
// key), and we need to map back. We choose here to privilege the successor by
// allowing it to overwrite the previously inserted key in the map.
- Map[BBOutputOffset] = BBInputOffset << 1;
+ Map.emplace(BBOutputOffset, BBInputOffset << 1);
const auto &IOAddressMap =
BB.getFunction()->getBinaryContext().getIOAddressMap();
@@ -72,8 +71,7 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
LLVM_DEBUG(dbgs() << " Key: " << Twine::utohexstr(OutputOffset) << " Val: "
<< Twine::utohexstr(InputOffset) << " (branch)\n");
- Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset,
- (InputOffset << 1) | BRANCHENTRY));
+ Map.emplace(OutputOffset, (InputOffset << 1) | BRANCHENTRY);
}
}
@@ -108,6 +106,19 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
for (const BinaryBasicBlock *const BB :
Function.getLayout().getMainFragment())
writeEntriesForBB(Map, *BB, InputAddress, OutputAddress);
+ // Add entries for deleted blocks. They are still required for correct BB
+ // mapping of branches modified by SCTC. By convention, they would have the
+ // end of the function as output address.
+ const BBHashMapTy &BBHashMap = getBBHashMap(InputAddress);
+ if (BBHashMap.size() != Function.size()) {
+ const uint64_t EndOffset = Function.getOutputSize();
+ std::unordered_set<uint32_t> MappedInputOffsets;
+ for (const BinaryBasicBlock &BB : Function)
+ MappedInputOffsets.emplace(BB.getInputOffset());
+ for (const auto &[InputOffset, _] : BBHashMap)
+ if (!llvm::is_contained(MappedInputOffsets, InputOffset))
+ Map.emplace(EndOffset, InputOffset << 1);
+ }
Maps.emplace(Function.getOutputAddress(), std::move(Map));
ReverseMap.emplace(OutputAddress, InputAddress);
@@ -138,8 +149,8 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
<< " basic block hashes\n";
}
-APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
- size_t EqualElems) {
+APInt BoltAddressTranslation::calculateBranchEntriesBitMask(
+ MapTy &Map, size_t EqualElems) const {
APInt BitMask(alignTo(EqualElems, 8), 0);
size_t Index = 0;
for (std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
@@ -422,7 +433,7 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
}
}
-void BoltAddressTranslation::dump(raw_ostream &OS) {
+void BoltAddressTranslation::dump(raw_ostream &OS) const {
const size_t NumTables = Maps.size();
OS << "BAT tables for " << NumTables << " functions:\n";
for (const auto &MapEntry : Maps) {
@@ -447,11 +458,15 @@ void BoltAddressTranslation::dump(raw_ostream &OS) {
OS << formatv(" hash: {0:x}", BBHashMap.getBBHash(Val));
OS << "\n";
}
- if (IsHotFunction)
- OS << "NumBlocks: " << NumBasicBlocksMap[Address] << '\n';
- if (SecondaryEntryPointsMap.count(Address)) {
+ if (IsHotFunction) {
+ auto NumBasicBlocksIt = NumBasicBlocksMap.find(Address);
+ assert(NumBasicBlocksIt != NumBasicBlocksMap.end());
+ OS << "NumBlocks: " << NumBasicBlocksIt->second << '\n';
+ }
+ auto SecondaryEntryPointsIt = SecondaryEntryPointsMap.find(Address);
+ if (SecondaryEntryPointsIt != SecondaryEntryPointsMap.end()) {
const std::vector<uint32_t> &SecondaryEntryPoints =
- SecondaryEntryPointsMap[Address];
+ SecondaryEntryPointsIt->second;
OS << SecondaryEntryPoints.size() << " secondary entry points:\n";
for (uint32_t EntryPointOffset : SecondaryEntryPoints)
OS << formatv("{0:x}\n", EntryPointOffset);
@@ -547,13 +562,6 @@ BoltAddressTranslation::getFallthroughsInTrace(uint64_t FuncAddress,
return Res;
}
-uint64_t BoltAddressTranslation::fetchParentAddress(uint64_t Address) const {
- auto Iter = ColdPartSource.find(Address);
- if (Iter == ColdPartSource.end())
- return 0;
- return Iter->second;
-}
-
bool BoltAddressTranslation::enabledFor(
llvm::object::ELFObjectFileBase *InputFile) const {
for (const SectionRef &Section : InputFile->sections()) {
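
The map values written above pack the input offset shifted left by one, with the low bit flagging a BRANCHENTRY record; deleted blocks are keyed at the function's output size. A sketch of just that bit packing (it ignores how the writer actually serializes the map on disk):

#include <cassert>
#include <cstdint>
#include <cstdio>

constexpr uint32_t BRANCHENTRY = 0x1; // low bit marks a branch entry

uint32_t encode(uint32_t InputOffset, bool IsBranch) {
  return (InputOffset << 1) | (IsBranch ? BRANCHENTRY : 0);
}

int main() {
  const uint32_t Val = encode(0x40, /*IsBranch=*/true);
  const uint32_t InputOffset = Val >> 1;   // recover the offset
  const bool IsBranch = Val & BRANCHENTRY; // recover the flag
  std::printf("offset=%#x branch=%d\n", (unsigned)InputOffset, (int)IsBranch);
  assert(InputOffset == 0x40 && IsBranch);
  return 0;
}
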
diff --git a/bolt/lib/Profile/CMakeLists.txt b/bolt/lib/Profile/CMakeLists.txt
index 045ac47edb95..ca8b9c34e63b 100644
--- a/bolt/lib/Profile/CMakeLists.txt
+++ b/bolt/lib/Profile/CMakeLists.txt
@@ -17,6 +17,5 @@ add_llvm_library(LLVMBOLTProfile
target_link_libraries(LLVMBOLTProfile
PRIVATE
LLVMBOLTCore
- LLVMBOLTPasses
LLVMBOLTUtils
)
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index f55caa7f03f7..ce6ec0a04ac1 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -613,7 +613,6 @@ Error DataAggregator::readProfile(BinaryContext &BC) {
if (std::error_code EC = writeBATYAML(BC, opts::SaveProfile))
report_error("cannot create output data file", EC);
}
- BC.logBOLTErrorsAndQuitOnFatal(PrintProgramStats().runOnFunctions(BC));
}
return Error::success();
@@ -673,7 +672,8 @@ DataAggregator::getBATParentFunction(const BinaryFunction &Func) const {
return nullptr;
}
-StringRef DataAggregator::getLocationName(const BinaryFunction &Func) const {
+StringRef DataAggregator::getLocationName(const BinaryFunction &Func,
+ bool BAT) {
if (!BAT)
return Func.getOneName();
@@ -702,7 +702,7 @@ bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address,
auto I = NamesToSamples.find(Func.getOneName());
if (I == NamesToSamples.end()) {
bool Success;
- StringRef LocName = getLocationName(Func);
+ StringRef LocName = getLocationName(Func, BAT);
std::tie(I, Success) = NamesToSamples.insert(
std::make_pair(Func.getOneName(),
FuncSampleData(LocName, FuncSampleData::ContainerTy())));
@@ -722,7 +722,7 @@ bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From,
FuncBranchData *AggrData = getBranchData(Func);
if (!AggrData) {
AggrData = &NamesToBranches[Func.getOneName()];
- AggrData->Name = getLocationName(Func);
+ AggrData->Name = getLocationName(Func, BAT);
setBranchData(Func, AggrData);
}
@@ -741,7 +741,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
StringRef SrcFunc;
StringRef DstFunc;
if (FromFunc) {
- SrcFunc = getLocationName(*FromFunc);
+ SrcFunc = getLocationName(*FromFunc, BAT);
FromAggrData = getBranchData(*FromFunc);
if (!FromAggrData) {
FromAggrData = &NamesToBranches[FromFunc->getOneName()];
@@ -752,7 +752,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
recordExit(*FromFunc, From, Mispreds, Count);
}
if (ToFunc) {
- DstFunc = getLocationName(*ToFunc);
+ DstFunc = getLocationName(*ToFunc, BAT);
ToAggrData = getBranchData(*ToFunc);
if (!ToAggrData) {
ToAggrData = &NamesToBranches[ToFunc->getOneName()];
@@ -2340,7 +2340,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
continue;
BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncAddress);
assert(BF);
- YamlBF.Name = getLocationName(*BF);
+ YamlBF.Name = getLocationName(*BF, BAT);
YamlBF.Id = BF->getFunctionNumber();
YamlBF.Hash = BAT->getBFHash(FuncAddress);
YamlBF.ExecCount = BF->getKnownExecutionCount();
@@ -2349,11 +2349,11 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
BAT->getBBHashMap(FuncAddress);
YamlBF.Blocks.resize(YamlBF.NumBasicBlocks);
- for (auto &&[Idx, YamlBB] : llvm::enumerate(YamlBF.Blocks))
- YamlBB.Index = Idx;
-
- for (auto BI = BlockMap.begin(), BE = BlockMap.end(); BI != BE; ++BI)
- YamlBF.Blocks[BI->second.getBBIndex()].Hash = BI->second.getBBHash();
+ for (auto &&[Entry, YamlBB] : llvm::zip(BlockMap, YamlBF.Blocks)) {
+ const auto &Block = Entry.second;
+ YamlBB.Hash = Block.Hash;
+ YamlBB.Index = Block.Index;
+ }
// Lookup containing basic block offset and index
auto getBlock = [&BlockMap](uint32_t Offset) {
@@ -2363,7 +2363,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
exit(1);
}
--BlockIt;
- return std::pair(BlockIt->first, BlockIt->second.getBBIndex());
+ return std::pair(BlockIt->first, BlockIt->second.Index);
};
for (const BranchInfo &BI : Branches.Data) {
diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp
index 06c5e96b7806..f2e999bbfdc6 100644
--- a/bolt/lib/Profile/DataReader.cpp
+++ b/bolt/lib/Profile/DataReader.cpp
@@ -598,8 +598,6 @@ void DataReader::readSampleData(BinaryFunction &BF) {
}
BF.ExecutionCount = TotalEntryCount;
-
- estimateEdgeCounts(BF);
}
void DataReader::convertBranchData(BinaryFunction &BF) const {
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 016962ff34d8..365bc5389266 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -30,6 +30,7 @@
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Timer.h"
#include "llvm/Support/xxhash.h"
#include "llvm/Transforms/Utils/SampleProfileInference.h"
@@ -42,6 +43,7 @@ using namespace llvm;
namespace opts {
+extern cl::opt<bool> TimeRewrite;
extern cl::OptionCategory BoltOptCategory;
cl::opt<bool>
@@ -372,8 +374,10 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
// Create necessary metadata for the flow function
for (FlowJump &Jump : Func.Jumps) {
- Func.Blocks.at(Jump.Source).SuccJumps.push_back(&Jump);
- Func.Blocks.at(Jump.Target).PredJumps.push_back(&Jump);
+ assert(Jump.Source < Func.Blocks.size());
+ Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
+ assert(Jump.Target < Func.Blocks.size());
+ Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
}
return Func;
}
@@ -705,6 +709,10 @@ void assignProfile(BinaryFunction &BF,
bool YAMLProfileReader::inferStaleProfile(
BinaryFunction &BF, const yaml::bolt::BinaryFunctionProfile &YamlBF) {
+
+ NamedRegionTimer T("inferStaleProfile", "stale profile inference", "rewrite",
+ "Rewrite passes", opts::TimeRewrite);
+
if (!BF.hasCFG())
return false;
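
The timer added above follows the usual llvm::NamedRegionTimer pattern: it accumulates into the named group only while the enabling flag is set, and the group's totals are printed along with LLVM's other timer reports. A minimal use mirroring the call in the hunk (assumes linking against LLVMSupport):

#include "llvm/Support/Timer.h"

void doInference(bool TimeRewrite) {
  // Scoped timer: starts here, stops when T is destroyed at end of scope.
  llvm::NamedRegionTimer T("inferStaleProfile", "stale profile inference",
                           "rewrite", "Rewrite passes", TimeRewrite);
  // ... work being timed ...
}
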
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 29d94067f459..f25f59201f1c 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -102,11 +102,14 @@ bool YAMLProfileReader::parseFunctionProfile(
if (BF.empty())
return true;
- if (!opts::IgnoreHash &&
- YamlBF.Hash != BF.computeHash(IsDFSOrder, HashFunction)) {
- if (opts::Verbosity >= 1)
- errs() << "BOLT-WARNING: function hash mismatch\n";
- ProfileMatched = false;
+ if (!opts::IgnoreHash) {
+ if (!BF.getHash())
+ BF.computeHash(IsDFSOrder, HashFunction);
+ if (YamlBF.Hash != BF.getHash()) {
+ if (opts::Verbosity >= 1)
+ errs() << "BOLT-WARNING: function hash mismatch\n";
+ ProfileMatched = false;
+ }
}
if (YamlBF.NumBasicBlocks != BF.size()) {
@@ -253,10 +256,8 @@ bool YAMLProfileReader::parseFunctionProfile(
if (BB.getExecutionCount() == BinaryBasicBlock::COUNT_NO_PROFILE)
BB.setExecutionCount(0);
- if (YamlBP.Header.Flags & BinaryFunction::PF_SAMPLE) {
+ if (YamlBP.Header.Flags & BinaryFunction::PF_SAMPLE)
BF.setExecutionCount(FunctionExecutionCount);
- estimateEdgeCounts(BF);
- }
ProfileMatched &= !MismatchedBlocks && !MismatchedCalls && !MismatchedEdges;
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index ef04ba0d21ad..cf6b61ddd603 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -10,6 +10,7 @@
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Profile/BoltAddressTranslation.h"
+#include "bolt/Profile/DataAggregator.h"
#include "bolt/Profile/ProfileReaderBase.h"
#include "bolt/Rewrite/RewriteInstance.h"
#include "llvm/Support/CommandLine.h"
@@ -39,6 +40,10 @@ const BinaryFunction *YAMLProfileWriter::setCSIDestination(
BC.getFunctionForSymbol(Symbol, &EntryID)) {
if (BAT && BAT->isBATFunction(Callee->getAddress()))
std::tie(Callee, EntryID) = BAT->translateSymbol(BC, *Symbol, Offset);
+ else if (const BinaryBasicBlock *BB =
+ Callee->getBasicBlockContainingOffset(Offset))
+ BC.getFunctionForSymbol(Callee->getSecondaryEntryPointSymbol(*BB),
+ &EntryID);
CSI.DestId = Callee->getFunctionNumber();
CSI.EntryDiscriminator = EntryID;
return Callee;
@@ -59,7 +64,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
BF.computeHash(UseDFS);
BF.computeBlockHashes();
- YamlBF.Name = BF.getPrintName();
+ YamlBF.Name = DataAggregator::getLocationName(BF, BAT);
YamlBF.Id = BF.getFunctionNumber();
YamlBF.Hash = BF.getHash();
YamlBF.NumBasicBlocks = BF.size();
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index cbb7199a53dd..aaa0e1ff4d46 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -23,6 +23,7 @@
#include "bolt/Passes/JTFootprintReduction.h"
#include "bolt/Passes/LongJmp.h"
#include "bolt/Passes/LoopInversionPass.h"
+#include "bolt/Passes/MCF.h"
#include "bolt/Passes/PLTCall.h"
#include "bolt/Passes/PatchEntries.h"
#include "bolt/Passes/RegReAssign.h"
@@ -90,6 +91,11 @@ PrintAfterLowering("print-after-lowering",
cl::desc("print function after instruction lowering"),
cl::Hidden, cl::cat(BoltOptCategory));
+static cl::opt<bool> PrintEstimateEdgeCounts(
+ "print-estimate-edge-counts",
+ cl::desc("print function after edge counts are set for no-LBR profile"),
+ cl::Hidden, cl::cat(BoltOptCategory));
+
cl::opt<bool>
PrintFinalized("print-finalized",
cl::desc("print function after CFG is finalized"),
@@ -334,8 +340,10 @@ Error BinaryFunctionPassManager::runPasses() {
Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
BinaryFunctionPassManager Manager(BC);
- const DynoStats InitialDynoStats =
- getDynoStats(BC.getBinaryFunctions(), BC.isAArch64());
+ Manager.registerPass(
+ std::make_unique<EstimateEdgeCounts>(PrintEstimateEdgeCounts));
+
+ Manager.registerPass(std::make_unique<DynoStatsSetPass>());
Manager.registerPass(std::make_unique<AsmDumpPass>(),
opts::AsmDump.getNumOccurrences());
@@ -447,10 +455,9 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
Manager.registerPass(std::make_unique<SplitFunctions>(PrintSplit));
// Print final dyno stats right while CFG and instruction analysis are intact.
- Manager.registerPass(
- std::make_unique<DynoStatsPrintPass>(
- InitialDynoStats, "after all optimizations before SCTC and FOP"),
- opts::PrintDynoStats || opts::DynoStatsAll);
+ Manager.registerPass(std::make_unique<DynoStatsPrintPass>(
+ "after all optimizations before SCTC and FOP"),
+ opts::PrintDynoStats || opts::DynoStatsAll);
// Add the StokeInfo pass, which extract functions for stoke optimization and
// get the liveness information for them
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index d582ce7b33a2..ab46503621e9 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -73,8 +73,7 @@ static void printDie(DWARFUnit &DU, uint64_t DIEOffset) {
DWARFDataExtractor DebugInfoData = DU.getDebugInfoExtractor();
DWARFDebugInfoEntry DIEEntry;
if (DIEEntry.extractFast(DU, &DIEOffset, DebugInfoData, NextCUOffset, 0)) {
- if (const DWARFAbbreviationDeclaration *AbbrDecl =
- DIEEntry.getAbbreviationDeclarationPtr()) {
+ if (DIEEntry.getAbbreviationDeclarationPtr()) {
DWARFDie DDie(&DU, &DIEEntry);
printDie(DDie);
} else {
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 99775ccfe38d..b2c8b2446f7e 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -393,7 +393,7 @@ void LinuxKernelRewriter::processLKKSymtab(bool IsGPL) {
for (uint64_t I = 0; I < SectionSize; I += 4) {
const uint64_t EntryAddress = SectionAddress + I;
- ErrorOr<uint64_t> Offset = BC.getSignedValueAtAddress(EntryAddress, 4);
+ ErrorOr<int64_t> Offset = BC.getSignedValueAtAddress(EntryAddress, 4);
assert(Offset && "Reading valid PC-relative offset for a ksymtab entry");
const int32_t SignedOffset = *Offset;
const uint64_t RefAddress = EntryAddress + SignedOffset;
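
The signature change consumed above (getSignedValueAtAddress now returns ErrorOr<int64_t>) matters because ksymtab entries hold PC-relative offsets that may be negative, so the raw 4-byte value must be sign-extended before it is added to the entry address. An isolated sketch with made-up addresses:

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t EntryAddress = 0xffffffff81a00000ULL;
  const uint32_t RawBytes = 0xfff00000u;                        // as stored in the section
  const int32_t SignedOffset = static_cast<int32_t>(RawBytes);  // -0x100000 after sign extension
  const uint64_t RefAddress = EntryAddress + SignedOffset;
  std::printf("ref = %#llx\n", (unsigned long long)RefAddress); // 0xffffffff81900000
  return 0;
}
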
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 6e1021a6df22..4b4913dd7a16 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -17,6 +17,7 @@
#include "bolt/Core/MCPlusBuilder.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Core/Relocation.h"
+#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "bolt/Profile/BoltAddressTranslation.h"
@@ -86,6 +87,7 @@ extern cl::list<std::string> ReorderData;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
extern cl::opt<bool> TerminalTrap;
extern cl::opt<bool> TimeBuild;
+extern cl::opt<bool> TimeRewrite;
cl::opt<bool> AllowStripped("allow-stripped",
cl::desc("allow processing of stripped binaries"),
@@ -236,11 +238,6 @@ UseGnuStack("use-gnu-stack",
cl::cat(BoltCategory));
static cl::opt<bool>
- TimeRewrite("time-rewrite",
- cl::desc("print time spent in rewriting passes"), cl::Hidden,
- cl::cat(BoltCategory));
-
-static cl::opt<bool>
SequentialDisassembly("sequential-disassembly",
cl::desc("performs disassembly sequentially"),
cl::init(false),
@@ -1500,7 +1497,7 @@ void RewriteInstance::registerFragments() {
if (!BC->hasSymbolsWithFileName()) {
BC->errs() << "BOLT-ERROR: input file has split functions but does not "
"have FILE symbols. If the binary was stripped, preserve "
- "FILE symbols with --keep-file-symbols strip option";
+ "FILE symbols with --keep-file-symbols strip option\n";
exit(1);
}
@@ -1988,6 +1985,7 @@ Error RewriteInstance::readSpecialSections() {
if (ErrorOr<BinarySection &> BATSec =
BC->getUniqueSectionByName(BoltAddressTranslation::SECTION_NAME)) {
+ BC->HasBATSection = true;
// Do not read BAT when plotting a heatmap
if (!opts::HeatmapMode) {
if (std::error_code EC = BAT->parse(BC->outs(), BATSec->getContents())) {
@@ -3208,12 +3206,14 @@ void RewriteInstance::preprocessProfileData() {
if (Error E = ProfileReader->preprocessProfile(*BC.get()))
report_error("cannot pre-process profile", std::move(E));
- if (!BC->hasSymbolsWithFileName() && ProfileReader->hasLocalsWithFileName()) {
+ if (!BC->hasSymbolsWithFileName() && ProfileReader->hasLocalsWithFileName() &&
+ !opts::AllowStripped) {
BC->errs()
<< "BOLT-ERROR: input binary does not have local file symbols "
"but profile data includes function names with embedded file "
"names. It appears that the input binary was stripped while a "
- "profiled binary was not\n";
+ "profiled binary was not. If you know what you are doing and "
+ "wish to proceed, use -allow-stripped option.\n";
exit(1);
}
}
@@ -3284,8 +3284,11 @@ void RewriteInstance::processProfileData() {
// Release memory used by profile reader.
ProfileReader.reset();
- if (opts::AggregateOnly)
+ if (opts::AggregateOnly) {
+ PrintProgramStats PPS(&*BAT);
+ BC->logBOLTErrorsAndQuitOnFatal(PPS.runOnFunctions(*BC));
exit(0);
+ }
}
void RewriteInstance::disassembleFunctions() {
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 8fdacffcb147..a33a9dc8c013 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1932,6 +1932,19 @@ public:
// = R_X86_64_PC32(Ln) + En - JT
// = R_X86_64_PC32(Ln + offsetof(En))
//
+ auto isRIPRel = [&](X86MemOperand &MO) {
+ // NB: DispExpr should be set
+ return MO.DispExpr != nullptr &&
+ MO.BaseRegNum == RegInfo->getProgramCounter() &&
+ MO.IndexRegNum == X86::NoRegister &&
+ MO.SegRegNum == X86::NoRegister;
+ };
+ auto isIndexed = [](X86MemOperand &MO, MCPhysReg R) {
+ // NB: IndexRegNum should be set.
+ return MO.IndexRegNum != X86::NoRegister && MO.BaseRegNum == R &&
+ MO.ScaleImm == 4 && MO.DispImm == 0 &&
+ MO.SegRegNum == X86::NoRegister;
+ };
LLVM_DEBUG(dbgs() << "Checking for PIC jump table\n");
MCInst *MemLocInstr = nullptr;
const MCInst *MovInstr = nullptr;
@@ -1965,9 +1978,8 @@ public:
std::optional<X86MemOperand> MO = evaluateX86MemoryOperand(Instr);
if (!MO)
break;
- if (MO->BaseRegNum != R1 || MO->ScaleImm != 4 ||
- MO->IndexRegNum == X86::NoRegister || MO->DispImm != 0 ||
- MO->SegRegNum != X86::NoRegister)
+ if (!isIndexed(*MO, R1))
+ // POSSIBLE_PIC_JUMP_TABLE
break;
MovInstr = &Instr;
} else {
@@ -1986,9 +1998,7 @@ public:
std::optional<X86MemOperand> MO = evaluateX86MemoryOperand(Instr);
if (!MO)
break;
- if (MO->BaseRegNum != RegInfo->getProgramCounter() ||
- MO->IndexRegNum != X86::NoRegister ||
- MO->SegRegNum != X86::NoRegister || MO->DispExpr == nullptr)
+ if (!isRIPRel(*MO))
break;
MemLocInstr = &Instr;
break;
@@ -2105,13 +2115,15 @@ public:
return IndirectBranchType::POSSIBLE_FIXED_BRANCH;
}
- if (Type == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE &&
- (MO->ScaleImm != 1 || MO->BaseRegNum != RIPRegister))
- return IndirectBranchType::UNKNOWN;
-
- if (Type != IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE &&
- MO->ScaleImm != PtrSize)
- return IndirectBranchType::UNKNOWN;
+ switch (Type) {
+ case IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE:
+ if (MO->ScaleImm != 1 || MO->BaseRegNum != RIPRegister)
+ return IndirectBranchType::UNKNOWN;
+ break;
+ default:
+ if (MO->ScaleImm != PtrSize)
+ return IndirectBranchType::UNKNOWN;
+ }
MemLocInstrOut = MemLocInstr;
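
The two helper lambdas introduced above classify memory operands by shape: a RIP-relative access with a symbolic displacement, and a zero-displacement access indexed off a given base register with scale 4. A standalone sketch of the same predicates over a simplified operand struct (field names mirror the hunk; the register constants are stand-ins for real MC register ids):

#include <cstdint>

struct MemOperand {
  unsigned BaseRegNum = 0;
  unsigned IndexRegNum = 0;
  unsigned SegRegNum = 0;
  int64_t ScaleImm = 1;
  int64_t DispImm = 0;
  const void *DispExpr = nullptr; // symbolic displacement, if any
};

constexpr unsigned NoRegister = 0;
constexpr unsigned RIPRegister = 1000; // stand-in for the program counter id

// RIP-relative load of a symbol: no index, no segment, symbolic displacement.
bool isRIPRel(const MemOperand &MO) {
  return MO.DispExpr != nullptr && MO.BaseRegNum == RIPRegister &&
         MO.IndexRegNum == NoRegister && MO.SegRegNum == NoRegister;
}

// Indexed access off base R with scale 4 and zero displacement.
bool isIndexed(const MemOperand &MO, unsigned R) {
  return MO.IndexRegNum != NoRegister && MO.BaseRegNum == R &&
         MO.ScaleImm == 4 && MO.DispImm == 0 && MO.SegRegNum == NoRegister;
}
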
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index ba296c10c00a..41c89bc8aeba 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -179,6 +179,10 @@ cl::opt<bool> TimeOpts("time-opts",
cl::desc("print time spent in each optimization"),
cl::cat(BoltOptCategory));
+cl::opt<bool> TimeRewrite("time-rewrite",
+ cl::desc("print time spent in rewriting passes"),
+ cl::Hidden, cl::cat(BoltCategory));
+
cl::opt<bool> UseOldText(
"use-old-text",
cl::desc("re-use space in old .text if possible (relocation mode)"),
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index 16e0bbd55f90..d1f8a216badc 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -1245,7 +1245,6 @@ void Graph::computeEdgeFrequencies(const uint64_t *Counters,
continue;
assert(SpanningTreeNodes[Cur].NumInEdges == 1, "must have 1 parent");
- const uint32_t Parent = SpanningTreeNodes[Cur].InEdges[0].Node;
const uint32_t ParentEdge = SpanningTreeNodes[Cur].InEdges[0].ID;
// Calculate parent edge freq.
@@ -1464,9 +1463,8 @@ void visitCallFlowEntry(CallFlowHashTable::MapEntry &Entry, int FD,
int openProfile() {
// Build the profile name string by appending our PID
char Buf[BufSize];
- char *Ptr = Buf;
uint64_t PID = __getpid();
- Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
+ char *Ptr = strCopy(Buf, __bolt_instr_filename, BufSize);
if (__bolt_instr_use_pid) {
Ptr = strCopy(Ptr, ".", BufSize - (Ptr - Buf + 1));
Ptr = intToStr(Ptr, PID, 10);
diff --git a/bolt/test/X86/bb-with-two-tail-calls.s b/bolt/test/X86/bb-with-two-tail-calls.s
index b6703e352ff4..8bbecc498ed7 100644
--- a/bolt/test/X86/bb-with-two-tail-calls.s
+++ b/bolt/test/X86/bb-with-two-tail-calls.s
@@ -8,11 +8,21 @@
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib
# RUN: llvm-bolt %t.exe -o %t.out --data %t.fdata --lite=0 --dyno-stats \
# RUN: --print-sctc --print-only=_start -enable-bat 2>&1 | FileCheck %s
+# RUN: llvm-objdump --syms %t.out > %t.log
+# RUN: llvm-bat-dump %t.out --dump-all >> %t.log
+# RUN: FileCheck %s --input-file %t.log --check-prefix=CHECK-BAT
+
# CHECK-NOT: Assertion `BranchInfo.size() == 2 && "could only be called for blocks with 2 successors"' failed.
# Two tail calls in the same basic block after SCTC:
# CHECK: {{.*}}: ja {{.*}} # TAILCALL # Offset: 7 # CTCTakenCount: 4
# CHECK-NEXT: {{.*}}: jmp {{.*}} # TAILCALL # Offset: 13
+# Confirm that a deleted basic block is emitted at function end offset (0xe)
+# CHECK-BAT: [[#%x,ADDR:]] g .text [[#%x,SIZE:]] _start
+# CHECK-BAT: Function Address: 0x[[#%x,ADDR]]
+# CHECK-BAT: 0x[[#%x,SIZE]]
+# CHECK-BAT: NumBlocks: 5
+
.globl _start
_start:
je x
diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test
index e21513b7dfe5..8f65eaba891e 100644
--- a/bolt/test/X86/bolt-address-translation-yaml.test
+++ b/bolt/test/X86/bolt-address-translation-yaml.test
@@ -31,7 +31,8 @@ RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat.preagg.txt -w %t.yaml -o
RUN: 2>&1 | FileCheck --check-prefix READ-BAT-CHECK %s
RUN: FileCheck --input-file %t.yaml --check-prefix YAML-BAT-CHECK %s
# Check that YAML converted from fdata matches YAML created directly with BAT.
-RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o /dev/null
+RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o /dev/null \
+RUN: 2>&1 | FileCheck --check-prefix READ-BAT-FDATA-CHECK %s
RUN: FileCheck --input-file %t.yaml-fdata --check-prefix YAML-BAT-CHECK %s
# Test resulting YAML profile with the original binary (no-stale mode)
@@ -40,11 +41,13 @@ RUN: | FileCheck --check-prefix CHECK-BOLT-YAML %s
WRITE-BAT-CHECK: BOLT-INFO: Wrote 5 BAT maps
WRITE-BAT-CHECK: BOLT-INFO: Wrote 4 function and 22 basic block hashes
-WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 384
+WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 404
READ-BAT-CHECK-NOT: BOLT-ERROR: unable to save profile in YAML format for input file processed by BOLT
READ-BAT-CHECK: BOLT-INFO: Parsed 5 BAT entries
READ-BAT-CHECK: PERF2BOLT: read 79 aggregated LBR entries
+READ-BAT-CHECK: BOLT-INFO: 5 out of 21 functions in the binary (23.8%) have non-empty execution profile
+READ-BAT-FDATA-CHECK: BOLT-INFO: 5 out of 16 functions in the binary (31.2%) have non-empty execution profile
YAML-BAT-CHECK: functions:
# Function not covered by BAT - has insns in basic block
diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test
index e6b21c14077b..dfdd1eea3233 100644
--- a/bolt/test/X86/bolt-address-translation.test
+++ b/bolt/test/X86/bolt-address-translation.test
@@ -37,7 +37,7 @@
# CHECK: BOLT: 3 out of 7 functions were overwritten.
# CHECK: BOLT-INFO: Wrote 6 BAT maps
# CHECK: BOLT-INFO: Wrote 3 function and 58 basic block hashes
-# CHECK: BOLT-INFO: BAT section size (bytes): 928
+# CHECK: BOLT-INFO: BAT section size (bytes): 940
#
# usqrt mappings (hot part). We match against any key (left side containing
# the bolted binary offsets) because BOLT may change where it puts instructions
diff --git a/bolt/test/X86/dwarf5-debug-names-class-type-decl.s b/bolt/test/X86/dwarf5-debug-names-class-type-decl.s
new file mode 100644
index 000000000000..587eaaf6f4ff
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-class-type-decl.s
@@ -0,0 +1,670 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t1.o
+# RUN: %clang %cflags -dwarf-5 %t1.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
+# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt > %t.txt
+# RUN: llvm-dwarfdump --show-form --verbose --debug-names %t.bolt >> %t.txt
+# RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s
+
+## This tests that BOLT doesn't generate entry for a DW_TAG_class_type declaration with DW_AT_name.
+
+# POSTCHECK: DW_TAG_type_unit
+# POSTCHECK: DW_TAG_class_type [7]
+# POSTCHECK-NEXT: DW_AT_name [DW_FORM_strx1] (indexed (00000006) string = "InnerState")
+# POSTCHECK-NEXT: DW_AT_declaration [DW_FORM_flag_present] (true)
+# POSTCHECK: Name Index
+# POSTCHECK-NOT: "InnerState"
+
+## -g2 -O0 -fdebug-types-section -gpubnames
+## namespace A {
+## namespace B {
+## class State {
+## public:
+## class InnerState{
+## InnerState() {}
+## };
+## State(){}
+## State(InnerState S){}
+## };
+## }
+## }
+##
+## int main() {
+## A::B::State S;
+## return 0;
+## }
+
+ .text
+ .file "main.cpp"
+ .file 0 "/DW_TAG_class_type" "main.cpp" md5 0x80f261b124b76c481b8761c040ab4802
+ .section .debug_info,"G",@progbits,16664150534606561860,comdat
+.Ltu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad -1782593539102989756 # Type Signature
+ .long 39 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x3b DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 2 # Abbrev [2] 0x23:0x2a DW_TAG_namespace
+ .byte 3 # DW_AT_name
+ .byte 2 # Abbrev [2] 0x25:0x27 DW_TAG_namespace
+ .byte 4 # DW_AT_name
+ .byte 3 # Abbrev [3] 0x27:0x24 DW_TAG_class_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 5 # DW_AT_name
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x2d:0xb DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x32:0x5 DW_TAG_formal_parameter
+ .long 77 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x38:0x10 DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 9 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x3d:0x5 DW_TAG_formal_parameter
+ .long 77 # DW_AT_type
+ # DW_AT_artificial
+ .byte 6 # Abbrev [6] 0x42:0x5 DW_TAG_formal_parameter
+ .long 72 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x48:0x2 DW_TAG_class_type
+ .byte 6 # DW_AT_name
+ # DW_AT_declaration
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x4d:0x5 DW_TAG_pointer_type
+ .long 39 # DW_AT_type
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .text
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .loc 0 14 0 # main.cpp:14:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $16, %rsp
+ movl $0, -4(%rbp)
+.Ltmp0:
+ .loc 0 15 15 prologue_end # main.cpp:15:15
+ leaq -5(%rbp), %rdi
+ callq _ZN1A1B5StateC2Ev
+ .loc 0 16 3 # main.cpp:16:3
+ xorl %eax, %eax
+ .loc 0 16 3 epilogue_begin is_stmt 0 # main.cpp:16:3
+ addq $16, %rsp
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section .text._ZN1A1B5StateC2Ev,"axG",@progbits,_ZN1A1B5StateC2Ev,comdat
+ .weak _ZN1A1B5StateC2Ev # -- Begin function _ZN1A1B5StateC2Ev
+ .p2align 4, 0x90
+ .type _ZN1A1B5StateC2Ev,@function
+_ZN1A1B5StateC2Ev: # @_ZN1A1B5StateC2Ev
+.Lfunc_begin1:
+ .loc 0 8 0 is_stmt 1 # main.cpp:8:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movq %rdi, -8(%rbp)
+.Ltmp2:
+ .loc 0 8 15 prologue_end epilogue_begin # main.cpp:8:15
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _ZN1A1B5StateC2Ev, .Lfunc_end1-_ZN1A1B5StateC2Ev
+ .cfi_endproc
+ # -- End function
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 57 # DW_TAG_namespace
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 50 # DW_AT_accessibility
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 52 # DW_AT_artificial
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 85 # DW_AT_ranges
+ .byte 35 # DW_FORM_rnglistx
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 116 # DW_AT_rnglists_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 12 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 13 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 100 # DW_AT_object_pointer
+ .byte 19 # DW_FORM_ref4
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 71 # DW_AT_specification
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 14 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 52 # DW_AT_artificial
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 15 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 9 # Abbrev [9] 0xc:0x7f DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .quad 0 # DW_AT_low_pc
+ .byte 0 # DW_AT_ranges
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .long .Lrnglists_table_base0 # DW_AT_rnglists_base
+ .byte 2 # Abbrev [2] 0x2b:0x1b DW_TAG_namespace
+ .byte 3 # DW_AT_name
+ .byte 2 # Abbrev [2] 0x2d:0x18 DW_TAG_namespace
+ .byte 4 # DW_AT_name
+ .byte 10 # Abbrev [10] 0x2f:0x15 DW_TAG_class_type
+ # DW_AT_declaration
+ .quad -1782593539102989756 # DW_AT_signature
+ .byte 4 # Abbrev [4] 0x38:0xb DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x3d:0x5 DW_TAG_formal_parameter
+ .long 97 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 11 # Abbrev [11] 0x46:0x1b DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 7 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 14 # DW_AT_decl_line
+ .long 129 # DW_AT_type
+ # DW_AT_external
+ .byte 12 # Abbrev [12] 0x55:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 123
+ .byte 10 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 15 # DW_AT_decl_line
+ .long 47 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x61:0x5 DW_TAG_pointer_type
+ .long 47 # DW_AT_type
+ .byte 13 # Abbrev [13] 0x66:0x1b DW_TAG_subprogram
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long 119 # DW_AT_object_pointer
+ .byte 9 # DW_AT_linkage_name
+ .long 56 # DW_AT_specification
+ .byte 14 # Abbrev [14] 0x77:0x9 DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 11 # DW_AT_name
+ .long 133 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 15 # Abbrev [15] 0x81:0x4 DW_TAG_base_type
+ .byte 8 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 8 # Abbrev [8] 0x85:0x5 DW_TAG_pointer_type
+ .long 47 # DW_AT_type
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
+ .section .debug_rnglists,"",@progbits
+ .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length
+.Ldebug_list_header_start0:
+ .short 5 # Version
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+ .long 1 # Offset entry count
+.Lrnglists_table_base0:
+ .long .Ldebug_ranges0-.Lrnglists_table_base0
+.Ldebug_ranges0:
+ .byte 3 # DW_RLE_startx_length
+ .byte 0 # start index
+ .uleb128 .Lfunc_end0-.Lfunc_begin0 # length
+ .byte 3 # DW_RLE_startx_length
+ .byte 1 # start index
+ .uleb128 .Lfunc_end1-.Lfunc_begin1 # length
+ .byte 0 # DW_RLE_end_of_list
+.Ldebug_list_header_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 52 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 19.0.0git" # string offset=0
+.Linfo_string1:
+ .asciz "main.cpp" # string offset=24
+.Linfo_string2:
+ .asciz "/home/ayermolo/local/tasks/T190087639/DW_TAG_class_type" # string offset=33
+.Linfo_string3:
+ .asciz "A" # string offset=89
+.Linfo_string4:
+ .asciz "B" # string offset=91
+.Linfo_string5:
+ .asciz "State" # string offset=93
+.Linfo_string6:
+ .asciz "InnerState" # string offset=99
+.Linfo_string7:
+ .asciz "main" # string offset=110
+.Linfo_string8:
+ .asciz "_ZN1A1B5StateC2Ev" # string offset=115
+.Linfo_string9:
+ .asciz "int" # string offset=133
+.Linfo_string10:
+ .asciz "S" # string offset=137
+.Linfo_string11:
+ .asciz "this" # string offset=139
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string7
+ .long .Linfo_string9
+ .long .Linfo_string8
+ .long .Linfo_string10
+ .long .Linfo_string11
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+ .quad .Lfunc_begin1
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 1 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 6 # Header: bucket count
+ .long 6 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long .Ltu_begin0 # Type unit 0
+ .long 0 # Bucket 0
+ .long 0 # Bucket 1
+ .long 1 # Bucket 2
+ .long 2 # Bucket 3
+ .long 3 # Bucket 4
+ .long 6 # Bucket 5
+ .long 193495088 # Hash in Bucket 2
+ .long 1059643959 # Hash in Bucket 3
+ .long 177670 # Hash in Bucket 4
+ .long 274811398 # Hash in Bucket 4
+ .long 2090499946 # Hash in Bucket 4
+ .long 177671 # Hash in Bucket 5
+ .long .Linfo_string9 # String in Bucket 2: int
+ .long .Linfo_string8 # String in Bucket 3: _ZN1A1B5StateC2Ev
+ .long .Linfo_string3 # String in Bucket 4: A
+ .long .Linfo_string5 # String in Bucket 4: State
+ .long .Linfo_string7 # String in Bucket 4: main
+ .long .Linfo_string4 # String in Bucket 5: B
+ .long .Lnames5-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 5
+.Lnames_abbrev_start0:
+ .byte 1 # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 2 # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 3 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 4 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 5 # Abbrev code
+ .byte 2 # DW_TAG_class_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 6 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 7 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames5:
+.L2:
+ .byte 1 # Abbreviation code
+ .long 129 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames4:
+.L3:
+ .byte 2 # Abbreviation code
+ .long 102 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: _ZN1A1B5StateC2Ev
+.Lnames0:
+.L4:
+ .byte 3 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 35 # DW_IDX_die_offset
+.L7: # DW_IDX_parent
+ .byte 4 # Abbreviation code
+ .long 43 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: A
+.Lnames2:
+.L1:
+ .byte 5 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 39 # DW_IDX_die_offset
+ .long .L5-.Lnames_entries0 # DW_IDX_parent
+ .byte 2 # Abbreviation code
+ .long 102 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: State
+.Lnames3:
+.L0:
+ .byte 2 # Abbreviation code
+ .long 70 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: main
+.Lnames1:
+.L5:
+ .byte 6 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 37 # DW_IDX_die_offset
+ .long .L4-.Lnames_entries0 # DW_IDX_parent
+.L6:
+ .byte 7 # Abbreviation code
+ .long 45 # DW_IDX_die_offset
+ .long .L7-.Lnames_entries0 # DW_IDX_parent
+ .byte 0 # End of list: B
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 19.0.0git"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-debug-names-enumeration-type-decl.s b/bolt/test/X86/dwarf5-debug-names-enumeration-type-decl.s
new file mode 100644
index 000000000000..031175763d79
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-enumeration-type-decl.s
@@ -0,0 +1,485 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t1.o
+# RUN: %clang %cflags -dwarf-5 %t1.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
+# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt > %t.txt
+# RUN: llvm-dwarfdump --show-form --verbose --debug-names %t.bolt >> %t.txt
+# RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s
+
+## This tests that BOLT doesn't generate an entry for a DW_TAG_enumeration_type declaration with DW_AT_name.
+
+# POSTCHECK: DW_TAG_type_unit
+# POSTCHECK: DW_TAG_enumeration_type [6]
+# POSTCHECK-NEXT: DW_AT_name [DW_FORM_strx1] (indexed (00000009) string = "InnerState")
+# POSTCHECK-NEXT: DW_AT_byte_size [DW_FORM_data1] (0x04)
+# POSTCHECK-NEXT: DW_AT_declaration [DW_FORM_flag_present] (true)
+# POSTCHECK: Name Index
+# POSTCHECK-NOT: "InnerState"
+
+## -g2 -O0 -fdebug-types-section -gpubnames
+## namespace B {
+## template <typename Task>
+## class State {
+## public:
+## enum class InnerState { STATE0 };
+## InnerState St;
+## };
+## }
+##
+## int main() {
+## B::State<int> S;
+## return 0;
+## }
+
+ .text
+ .file "main.cpp"
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .file 0 "/DW_TAG_enumeration_type" "main.cpp" md5 0x2e8962f8ef4bf6eb6f8bd92966c0848b
+ .loc 0 10 0 # main.cpp:10:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movl $0, -4(%rbp)
+.Ltmp0:
+ .loc 0 12 3 prologue_end # main.cpp:12:3
+ xorl %eax, %eax
+ .loc 0 12 3 epilogue_begin is_stmt 0 # main.cpp:12:3
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section .debug_info,"G",@progbits,8822129917070965541,comdat
+.Ltu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad 8822129917070965541 # Type Signature
+ .long 37 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x2d DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 2 # Abbrev [2] 0x23:0x1d DW_TAG_namespace
+ .byte 6 # DW_AT_name
+ .byte 3 # Abbrev [3] 0x25:0x1a DW_TAG_class_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 10 # DW_AT_name
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x2b:0x6 DW_TAG_template_type_parameter
+ .long 64 # DW_AT_type
+ .byte 7 # DW_AT_name
+ .byte 5 # Abbrev [5] 0x31:0xa DW_TAG_member
+ .byte 8 # DW_AT_name
+ .long 59 # DW_AT_type
+ .byte 0 # DW_AT_decl_file
+ .byte 6 # DW_AT_decl_line
+ .byte 0 # DW_AT_data_member_location
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 6 # Abbrev [6] 0x3b:0x3 DW_TAG_enumeration_type
+ .byte 9 # DW_AT_name
+ .byte 4 # DW_AT_byte_size
+ # DW_AT_declaration
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x40:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 57 # DW_TAG_namespace
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 47 # DW_TAG_template_type_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 13 # DW_TAG_member
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 56 # DW_AT_data_member_location
+ .byte 11 # DW_FORM_data1
+ .byte 50 # DW_AT_accessibility
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 4 # DW_TAG_enumeration_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 8 # Abbrev [8] 0xc:0x43 DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .byte 9 # Abbrev [9] 0x23:0x1b DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 3 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 10 # DW_AT_decl_line
+ .long 62 # DW_AT_type
+ # DW_AT_external
+ .byte 10 # Abbrev [10] 0x32:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 11 # DW_AT_decl_line
+ .long 68 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x3e:0x4 DW_TAG_base_type
+ .byte 4 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 2 # Abbrev [2] 0x42:0xc DW_TAG_namespace
+ .byte 6 # DW_AT_name
+ .byte 11 # Abbrev [11] 0x44:0x9 DW_TAG_class_type
+ # DW_AT_declaration
+ .quad 8822129917070965541 # DW_AT_signature
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
+ .section .debug_str_offsets,"",@progbits
+ .long 48 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 19.0.0git" # string offset=0
+.Linfo_string1:
+ .asciz "main.cpp" # string offset=24
+.Linfo_string2:
+ .asciz "/home/ayermolo/local/tasks/T190087639/DW_TAG_enumeration_type" # string offset=33
+.Linfo_string3:
+ .asciz "main" # string offset=95
+.Linfo_string4:
+ .asciz "int" # string offset=100
+.Linfo_string5:
+ .asciz "S" # string offset=104
+.Linfo_string6:
+ .asciz "B" # string offset=106
+.Linfo_string7:
+ .asciz "Task" # string offset=108
+.Linfo_string8:
+ .asciz "St" # string offset=113
+.Linfo_string9:
+ .asciz "InnerState" # string offset=116
+.Linfo_string10:
+ .asciz "State<int>" # string offset=127
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string7
+ .long .Linfo_string8
+ .long .Linfo_string9
+ .long .Linfo_string10
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 1 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 4 # Header: bucket count
+ .long 4 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long .Ltu_begin0 # Type unit 0
+ .long 1 # Bucket 0
+ .long 0 # Bucket 1
+ .long 2 # Bucket 2
+ .long 3 # Bucket 3
+ .long 193495088 # Hash in Bucket 0
+ .long 2090499946 # Hash in Bucket 2
+ .long 177671 # Hash in Bucket 3
+ .long 624407275 # Hash in Bucket 3
+ .long .Linfo_string4 # String in Bucket 0: int
+ .long .Linfo_string3 # String in Bucket 2: main
+ .long .Linfo_string6 # String in Bucket 3: B
+ .long .Linfo_string10 # String in Bucket 3: State<int>
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 0
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 3
+.Lnames_abbrev_start0:
+ .byte 1 # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 2 # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 3 # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 4 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 5 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 6 # Abbrev code
+ .byte 2 # DW_TAG_class_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+.L0:
+ .byte 1 # Abbreviation code
+ .long 62 # DW_IDX_die_offset
+.L2: # DW_IDX_parent
+ .byte 2 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 64 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames0:
+.L3:
+ .byte 3 # Abbreviation code
+ .long 35 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: main
+.Lnames2:
+ .byte 4 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 35 # DW_IDX_die_offset
+.L1: # DW_IDX_parent
+ .byte 5 # Abbreviation code
+ .long 66 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: B
+.Lnames3:
+.L4:
+ .byte 6 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 37 # DW_IDX_die_offset
+ .long .L3-.Lnames_entries0 # DW_IDX_parent
+ .byte 0 # End of list: State<int>
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 19.0.0git"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/dwarf5-debug-names-structure-type-decl.s b/bolt/test/X86/dwarf5-debug-names-structure-type-decl.s
new file mode 100644
index 000000000000..6eb2852c26ba
--- /dev/null
+++ b/bolt/test/X86/dwarf5-debug-names-structure-type-decl.s
@@ -0,0 +1,671 @@
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t1.o
+# RUN: %clang %cflags -dwarf-5 %t1.o -o %t.exe -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections
+# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt > %t.txt
+# RUN: llvm-dwarfdump --show-form --verbose --debug-names %t.bolt >> %t.txt
+# RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s
+
+## This tests that BOLT doesn't generate an entry for a DW_TAG_structure_type declaration with DW_AT_name.
+
+# POSTCHECK: DW_TAG_type_unit
+# POSTCHECK: DW_TAG_structure_type [7]
+# POSTCHECK-NEXT: DW_AT_name [DW_FORM_strx1] (indexed (00000006) string = "InnerState")
+# POSTCHECK-NEXT: DW_AT_declaration [DW_FORM_flag_present] (true)
+# POSTCHECK: Name Index
+# POSTCHECK-NOT: "InnerState"
+
+## -g2 -O0 -fdebug-types-section -gpubnames
+## namespace A {
+## namespace B {
+## class State {
+## public:
+## struct InnerState{
+## InnerState() {}
+## };
+## State(){}
+## State(InnerState S){}
+## };
+## }
+## }
+##
+## int main() {
+## A::B::State S;
+## return 0;
+## }
+
+
+ .text
+ .file "main.cpp"
+ .file 0 "/DW_TAG_structure_type" "main.cpp" md5 0xd43ba503b70d00353c195087e1fe16e2
+ .section .debug_info,"G",@progbits,16664150534606561860,comdat
+.Ltu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 2 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .quad -1782593539102989756 # Type Signature
+ .long 39 # Type DIE Offset
+ .byte 1 # Abbrev [1] 0x18:0x3b DW_TAG_type_unit
+ .short 33 # DW_AT_language
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .byte 2 # Abbrev [2] 0x23:0x2a DW_TAG_namespace
+ .byte 3 # DW_AT_name
+ .byte 2 # Abbrev [2] 0x25:0x27 DW_TAG_namespace
+ .byte 4 # DW_AT_name
+ .byte 3 # Abbrev [3] 0x27:0x24 DW_TAG_class_type
+ .byte 5 # DW_AT_calling_convention
+ .byte 5 # DW_AT_name
+ .byte 1 # DW_AT_byte_size
+ .byte 0 # DW_AT_decl_file
+ .byte 3 # DW_AT_decl_line
+ .byte 4 # Abbrev [4] 0x2d:0xb DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x32:0x5 DW_TAG_formal_parameter
+ .long 77 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 4 # Abbrev [4] 0x38:0x10 DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 9 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x3d:0x5 DW_TAG_formal_parameter
+ .long 77 # DW_AT_type
+ # DW_AT_artificial
+ .byte 6 # Abbrev [6] 0x42:0x5 DW_TAG_formal_parameter
+ .long 72 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 7 # Abbrev [7] 0x48:0x2 DW_TAG_structure_type
+ .byte 6 # DW_AT_name
+ # DW_AT_declaration
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x4d:0x5 DW_TAG_pointer_type
+ .long 39 # DW_AT_type
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+ .text
+ .globl main # -- Begin function main
+ .p2align 4, 0x90
+ .type main,@function
+main: # @main
+.Lfunc_begin0:
+ .loc 0 14 0 # main.cpp:14:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ subq $16, %rsp
+ movl $0, -4(%rbp)
+.Ltmp0:
+ .loc 0 15 15 prologue_end # main.cpp:15:15
+ leaq -5(%rbp), %rdi
+ callq _ZN1A1B5StateC2Ev
+ .loc 0 16 3 # main.cpp:16:3
+ xorl %eax, %eax
+ .loc 0 16 3 epilogue_begin is_stmt 0 # main.cpp:16:3
+ addq $16, %rsp
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp1:
+.Lfunc_end0:
+ .size main, .Lfunc_end0-main
+ .cfi_endproc
+ # -- End function
+ .section .text._ZN1A1B5StateC2Ev,"axG",@progbits,_ZN1A1B5StateC2Ev,comdat
+ .weak _ZN1A1B5StateC2Ev # -- Begin function _ZN1A1B5StateC2Ev
+ .p2align 4, 0x90
+ .type _ZN1A1B5StateC2Ev,@function
+_ZN1A1B5StateC2Ev: # @_ZN1A1B5StateC2Ev
+.Lfunc_begin1:
+ .loc 0 8 0 is_stmt 1 # main.cpp:8:0
+ .cfi_startproc
+# %bb.0: # %entry
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset %rbp, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register %rbp
+ movq %rdi, -8(%rbp)
+.Ltmp2:
+ .loc 0 8 15 prologue_end epilogue_begin # main.cpp:8:15
+ popq %rbp
+ .cfi_def_cfa %rsp, 8
+ retq
+.Ltmp3:
+.Lfunc_end1:
+ .size _ZN1A1B5StateC2Ev, .Lfunc_end1-_ZN1A1B5StateC2Ev
+ .cfi_endproc
+ # -- End function
+ .section .debug_abbrev,"",@progbits
+ .byte 1 # Abbreviation Code
+ .byte 65 # DW_TAG_type_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 2 # Abbreviation Code
+ .byte 57 # DW_TAG_namespace
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 3 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 54 # DW_AT_calling_convention
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 4 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 50 # DW_AT_accessibility
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 5 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 52 # DW_AT_artificial
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 6 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 7 # Abbreviation Code
+ .byte 19 # DW_TAG_structure_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 8 # Abbreviation Code
+ .byte 15 # DW_TAG_pointer_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 9 # Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 37 # DW_FORM_strx1
+ .byte 19 # DW_AT_language
+ .byte 5 # DW_FORM_data2
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 114 # DW_AT_str_offsets_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 16 # DW_AT_stmt_list
+ .byte 23 # DW_FORM_sec_offset
+ .byte 27 # DW_AT_comp_dir
+ .byte 37 # DW_FORM_strx1
+ .byte 17 # DW_AT_low_pc
+ .byte 1 # DW_FORM_addr
+ .byte 85 # DW_AT_ranges
+ .byte 35 # DW_FORM_rnglistx
+ .byte 115 # DW_AT_addr_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 116 # DW_AT_rnglists_base
+ .byte 23 # DW_FORM_sec_offset
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 10 # Abbreviation Code
+ .byte 2 # DW_TAG_class_type
+ .byte 1 # DW_CHILDREN_yes
+ .byte 60 # DW_AT_declaration
+ .byte 25 # DW_FORM_flag_present
+ .byte 105 # DW_AT_signature
+ .byte 32 # DW_FORM_ref_sig8
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 11 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 63 # DW_AT_external
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 12 # Abbreviation Code
+ .byte 52 # DW_TAG_variable
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 58 # DW_AT_decl_file
+ .byte 11 # DW_FORM_data1
+ .byte 59 # DW_AT_decl_line
+ .byte 11 # DW_FORM_data1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 13 # Abbreviation Code
+ .byte 46 # DW_TAG_subprogram
+ .byte 1 # DW_CHILDREN_yes
+ .byte 17 # DW_AT_low_pc
+ .byte 27 # DW_FORM_addrx
+ .byte 18 # DW_AT_high_pc
+ .byte 6 # DW_FORM_data4
+ .byte 64 # DW_AT_frame_base
+ .byte 24 # DW_FORM_exprloc
+ .byte 100 # DW_AT_object_pointer
+ .byte 19 # DW_FORM_ref4
+ .byte 110 # DW_AT_linkage_name
+ .byte 37 # DW_FORM_strx1
+ .byte 71 # DW_AT_specification
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 14 # Abbreviation Code
+ .byte 5 # DW_TAG_formal_parameter
+ .byte 0 # DW_CHILDREN_no
+ .byte 2 # DW_AT_location
+ .byte 24 # DW_FORM_exprloc
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 73 # DW_AT_type
+ .byte 19 # DW_FORM_ref4
+ .byte 52 # DW_AT_artificial
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 15 # Abbreviation Code
+ .byte 36 # DW_TAG_base_type
+ .byte 0 # DW_CHILDREN_no
+ .byte 3 # DW_AT_name
+ .byte 37 # DW_FORM_strx1
+ .byte 62 # DW_AT_encoding
+ .byte 11 # DW_FORM_data1
+ .byte 11 # DW_AT_byte_size
+ .byte 11 # DW_FORM_data1
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .byte 9 # Abbrev [9] 0xc:0x7f DW_TAG_compile_unit
+ .byte 0 # DW_AT_producer
+ .short 33 # DW_AT_language
+ .byte 1 # DW_AT_name
+ .long .Lstr_offsets_base0 # DW_AT_str_offsets_base
+ .long .Lline_table_start0 # DW_AT_stmt_list
+ .byte 2 # DW_AT_comp_dir
+ .quad 0 # DW_AT_low_pc
+ .byte 0 # DW_AT_ranges
+ .long .Laddr_table_base0 # DW_AT_addr_base
+ .long .Lrnglists_table_base0 # DW_AT_rnglists_base
+ .byte 2 # Abbrev [2] 0x2b:0x1b DW_TAG_namespace
+ .byte 3 # DW_AT_name
+ .byte 2 # Abbrev [2] 0x2d:0x18 DW_TAG_namespace
+ .byte 4 # DW_AT_name
+ .byte 10 # Abbrev [10] 0x2f:0x15 DW_TAG_class_type
+ # DW_AT_declaration
+ .quad -1782593539102989756 # DW_AT_signature
+ .byte 4 # Abbrev [4] 0x38:0xb DW_TAG_subprogram
+ .byte 5 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 8 # DW_AT_decl_line
+ # DW_AT_declaration
+ # DW_AT_external
+ .byte 1 # DW_AT_accessibility
+ # DW_ACCESS_public
+ .byte 5 # Abbrev [5] 0x3d:0x5 DW_TAG_formal_parameter
+ .long 97 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 0 # End Of Children Mark
+ .byte 11 # Abbrev [11] 0x46:0x1b DW_TAG_subprogram
+ .byte 0 # DW_AT_low_pc
+ .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .byte 7 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 14 # DW_AT_decl_line
+ .long 129 # DW_AT_type
+ # DW_AT_external
+ .byte 12 # Abbrev [12] 0x55:0xb DW_TAG_variable
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 123
+ .byte 10 # DW_AT_name
+ .byte 0 # DW_AT_decl_file
+ .byte 15 # DW_AT_decl_line
+ .long 47 # DW_AT_type
+ .byte 0 # End Of Children Mark
+ .byte 8 # Abbrev [8] 0x61:0x5 DW_TAG_pointer_type
+ .long 47 # DW_AT_type
+ .byte 13 # Abbrev [13] 0x66:0x1b DW_TAG_subprogram
+ .byte 1 # DW_AT_low_pc
+ .long .Lfunc_end1-.Lfunc_begin1 # DW_AT_high_pc
+ .byte 1 # DW_AT_frame_base
+ .byte 86
+ .long 119 # DW_AT_object_pointer
+ .byte 9 # DW_AT_linkage_name
+ .long 56 # DW_AT_specification
+ .byte 14 # Abbrev [14] 0x77:0x9 DW_TAG_formal_parameter
+ .byte 2 # DW_AT_location
+ .byte 145
+ .byte 120
+ .byte 11 # DW_AT_name
+ .long 133 # DW_AT_type
+ # DW_AT_artificial
+ .byte 0 # End Of Children Mark
+ .byte 15 # Abbrev [15] 0x81:0x4 DW_TAG_base_type
+ .byte 8 # DW_AT_name
+ .byte 5 # DW_AT_encoding
+ .byte 4 # DW_AT_byte_size
+ .byte 8 # Abbrev [8] 0x85:0x5 DW_TAG_pointer_type
+ .long 47 # DW_AT_type
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
+ .section .debug_rnglists,"",@progbits
+ .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length
+.Ldebug_list_header_start0:
+ .short 5 # Version
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+ .long 1 # Offset entry count
+.Lrnglists_table_base0:
+ .long .Ldebug_ranges0-.Lrnglists_table_base0
+.Ldebug_ranges0:
+ .byte 3 # DW_RLE_startx_length
+ .byte 0 # start index
+ .uleb128 .Lfunc_end0-.Lfunc_begin0 # length
+ .byte 3 # DW_RLE_startx_length
+ .byte 1 # start index
+ .uleb128 .Lfunc_end1-.Lfunc_begin1 # length
+ .byte 0 # DW_RLE_end_of_list
+.Ldebug_list_header_end0:
+ .section .debug_str_offsets,"",@progbits
+ .long 52 # Length of String Offsets Set
+ .short 5
+ .short 0
+.Lstr_offsets_base0:
+ .section .debug_str,"MS",@progbits,1
+.Linfo_string0:
+ .asciz "clang version 19.0.0git" # string offset=0
+.Linfo_string1:
+ .asciz "main.cpp" # string offset=24
+.Linfo_string2:
+ .asciz "/home/ayermolo/local/tasks/T190087639/DW_TAG_structure_type" # string offset=33
+.Linfo_string3:
+ .asciz "A" # string offset=93
+.Linfo_string4:
+ .asciz "B" # string offset=95
+.Linfo_string5:
+ .asciz "State" # string offset=97
+.Linfo_string6:
+ .asciz "InnerState" # string offset=103
+.Linfo_string7:
+ .asciz "main" # string offset=114
+.Linfo_string8:
+ .asciz "_ZN1A1B5StateC2Ev" # string offset=119
+.Linfo_string9:
+ .asciz "int" # string offset=137
+.Linfo_string10:
+ .asciz "S" # string offset=141
+.Linfo_string11:
+ .asciz "this" # string offset=143
+ .section .debug_str_offsets,"",@progbits
+ .long .Linfo_string0
+ .long .Linfo_string1
+ .long .Linfo_string2
+ .long .Linfo_string3
+ .long .Linfo_string4
+ .long .Linfo_string5
+ .long .Linfo_string6
+ .long .Linfo_string7
+ .long .Linfo_string9
+ .long .Linfo_string8
+ .long .Linfo_string10
+ .long .Linfo_string11
+ .section .debug_addr,"",@progbits
+ .long .Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution
+.Ldebug_addr_start0:
+ .short 5 # DWARF version number
+ .byte 8 # Address size
+ .byte 0 # Segment selector size
+.Laddr_table_base0:
+ .quad .Lfunc_begin0
+ .quad .Lfunc_begin1
+.Ldebug_addr_end0:
+ .section .debug_names,"",@progbits
+ .long .Lnames_end0-.Lnames_start0 # Header: unit length
+.Lnames_start0:
+ .short 5 # Header: version
+ .short 0 # Header: padding
+ .long 1 # Header: compilation unit count
+ .long 1 # Header: local type unit count
+ .long 0 # Header: foreign type unit count
+ .long 6 # Header: bucket count
+ .long 6 # Header: name count
+ .long .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+ .long 8 # Header: augmentation string size
+ .ascii "LLVM0700" # Header: augmentation string
+ .long .Lcu_begin0 # Compilation unit 0
+ .long .Ltu_begin0 # Type unit 0
+ .long 0 # Bucket 0
+ .long 0 # Bucket 1
+ .long 1 # Bucket 2
+ .long 2 # Bucket 3
+ .long 3 # Bucket 4
+ .long 6 # Bucket 5
+ .long 193495088 # Hash in Bucket 2
+ .long 1059643959 # Hash in Bucket 3
+ .long 177670 # Hash in Bucket 4
+ .long 274811398 # Hash in Bucket 4
+ .long 2090499946 # Hash in Bucket 4
+ .long 177671 # Hash in Bucket 5
+ .long .Linfo_string9 # String in Bucket 2: int
+ .long .Linfo_string8 # String in Bucket 3: _ZN1A1B5StateC2Ev
+ .long .Linfo_string3 # String in Bucket 4: A
+ .long .Linfo_string5 # String in Bucket 4: State
+ .long .Linfo_string7 # String in Bucket 4: main
+ .long .Linfo_string4 # String in Bucket 5: B
+ .long .Lnames5-.Lnames_entries0 # Offset in Bucket 2
+ .long .Lnames4-.Lnames_entries0 # Offset in Bucket 3
+ .long .Lnames0-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames2-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames3-.Lnames_entries0 # Offset in Bucket 4
+ .long .Lnames1-.Lnames_entries0 # Offset in Bucket 5
+.Lnames_abbrev_start0:
+ .byte 1 # Abbrev code
+ .byte 36 # DW_TAG_base_type
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 2 # Abbrev code
+ .byte 46 # DW_TAG_subprogram
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 3 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 4 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 25 # DW_FORM_flag_present
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 5 # Abbrev code
+ .byte 2 # DW_TAG_class_type
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 6 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 2 # DW_IDX_type_unit
+ .byte 11 # DW_FORM_data1
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 7 # Abbrev code
+ .byte 57 # DW_TAG_namespace
+ .byte 3 # DW_IDX_die_offset
+ .byte 19 # DW_FORM_ref4
+ .byte 4 # DW_IDX_parent
+ .byte 19 # DW_FORM_ref4
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev
+ .byte 0 # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames5:
+.L2:
+ .byte 1 # Abbreviation code
+ .long 129 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: int
+.Lnames4:
+.L3:
+ .byte 2 # Abbreviation code
+ .long 102 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: _ZN1A1B5StateC2Ev
+.Lnames0:
+.L4:
+ .byte 3 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 35 # DW_IDX_die_offset
+.L7: # DW_IDX_parent
+ .byte 4 # Abbreviation code
+ .long 43 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: A
+.Lnames2:
+.L1:
+ .byte 5 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 39 # DW_IDX_die_offset
+ .long .L5-.Lnames_entries0 # DW_IDX_parent
+ .byte 2 # Abbreviation code
+ .long 102 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: State
+.Lnames3:
+.L0:
+ .byte 2 # Abbreviation code
+ .long 70 # DW_IDX_die_offset
+ .byte 0 # DW_IDX_parent
+ # End of list: main
+.Lnames1:
+.L5:
+ .byte 6 # Abbreviation code
+ .byte 0 # DW_IDX_type_unit
+ .long 37 # DW_IDX_die_offset
+ .long .L4-.Lnames_entries0 # DW_IDX_parent
+.L6:
+ .byte 7 # Abbreviation code
+ .long 45 # DW_IDX_die_offset
+ .long .L7-.Lnames_entries0 # DW_IDX_parent
+ .byte 0 # End of list: B
+ .p2align 2, 0x0
+.Lnames_end0:
+ .ident "clang version 19.0.0git"
+ .section ".note.GNU-stack","",@progbits
+ .addrsig
+ .section .debug_line,"",@progbits
+.Lline_table_start0:
diff --git a/bolt/test/X86/ignored-interprocedural-reference.s b/bolt/test/X86/ignored-interprocedural-reference.s
new file mode 100644
index 000000000000..12e4fb92adcc
--- /dev/null
+++ b/bolt/test/X86/ignored-interprocedural-reference.s
@@ -0,0 +1,49 @@
+# This reproduces a bug where interprocedural references from ignored
+# functions were not processed.
+
+# REQUIRES: system-linux
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -nostdlib -Wl,-q
+# RUN: llvm-bolt %t.exe -o %t.out --enable-bat -funcs=main
+# RUN: link_fdata %s %t.out %t.preagg PREAGG
+# RUN: perf2bolt %t.out -p %t.preagg --pa -o %t.fdata -w %t.yaml
+# RUN: FileCheck %s --input-file=%t.fdata --check-prefix=CHECK-FDATA
+# RUN: FileCheck %s --input-file=%t.yaml --check-prefix=CHECK-YAML
+
+# CHECK-FDATA: 1 main 0 1 foo a 1 1
+# CHECK-YAML: name: main
+# CHECK-YAML: calls: {{.*}} disc: 1
+
+# PREAGG: B #main# #foo_secondary# 1 1
+# main calls foo at a valid instruction offset past nops that are to be stripped.
+ .globl main
+main:
+ .cfi_startproc
+ call foo_secondary
+ ret
+ .cfi_endproc
+.size main,.-main
+
+# Placeholder cold fragment to force main to be ignored in non-relocation mode.
+ .globl main.cold
+main.cold:
+ .cfi_startproc
+ ud2
+ .cfi_endproc
+.size main.cold,.-main.cold
+
+# foo is set up to contain a valid instruction at the called offset, and
+# trapping instructions past that.
+ .globl foo
+foo:
+ .cfi_startproc
+ .nops 10
+ .globl foo_secondary
+foo_secondary:
+ ret
+ .rept 20
+ int3
+ .endr
+ .cfi_endproc
+.size foo,.-foo
diff --git a/bolt/test/X86/register-fragments-bolt-symbols.s b/bolt/test/X86/register-fragments-bolt-symbols.s
index 6478adf19372..90c402b2234d 100644
--- a/bolt/test/X86/register-fragments-bolt-symbols.s
+++ b/bolt/test/X86/register-fragments-bolt-symbols.s
@@ -18,6 +18,11 @@
# RUN: FileCheck --input-file %t.bat.fdata --check-prefix=CHECK-FDATA %s
# RUN: FileCheck --input-file %t.bat.yaml --check-prefix=CHECK-YAML %s
+# RUN: link_fdata --no-redefine %s %t.bolt %t.preagg2 PREAGG2
+# PREAGG2: B X:0 #chain# 1 0
+# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -o %t.bat2.fdata -w %t.bat2.yaml
+# RUN: FileCheck %s --input-file %t.bat2.yaml --check-prefix=CHECK-YAML2
+
# CHECK-SYMS: l df *ABS* [[#]] chain.s
# CHECK-SYMS: l F .bolt.org.text [[#]] chain
# CHECK-SYMS: l F .text.cold [[#]] chain.cold.0
@@ -28,6 +33,9 @@
# CHECK-FDATA: 0 [unknown] 0 1 chain/chain.s/2 10 0 1
# CHECK-YAML: - name: 'chain/chain.s/2'
+# CHECK-YAML2: - name: 'chain/chain.s/1'
+## non-BAT function has non-zero insns:
+# CHECK-YAML2: insns: 1
.file "chain.s"
.text
diff --git a/bolt/test/link_fdata.py b/bolt/test/link_fdata.py
index 0232dd3211e9..3837e394ccc8 100755
--- a/bolt/test/link_fdata.py
+++ b/bolt/test/link_fdata.py
@@ -19,6 +19,7 @@ parser.add_argument("output")
parser.add_argument("prefix", nargs="?", default="FDATA", help="Custom FDATA prefix")
parser.add_argument("--nmtool", default="nm", help="Path to nm tool")
parser.add_argument("--no-lbr", action="store_true")
+parser.add_argument("--no-redefine", action="store_true")
args = parser.parse_args()
@@ -90,6 +91,8 @@ nm_output = subprocess.run(
symbols = {}
for symline in nm_output.splitlines():
symval, _, symname = symline.split(maxsplit=2)
+ if symname in symbols and args.no_redefine:
+ continue
symbols[symname] = symval
diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
index 36687a8e761e..c87b3ea7e261 100644
--- a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.cpp
@@ -54,7 +54,9 @@ AST_MATCHER(QualType, isEnableIf) {
AST_MATCHER_P(TemplateTypeParmDecl, hasDefaultArgument,
clang::ast_matchers::internal::Matcher<QualType>, TypeMatcher) {
return Node.hasDefaultArgument() &&
- TypeMatcher.matches(Node.getDefaultArgument(), Finder, Builder);
+ TypeMatcher.matches(
+ Node.getDefaultArgument().getArgument().getAsType(), Finder,
+ Builder);
}
AST_MATCHER(TemplateDecl, hasAssociatedConstraints) {
return Node.hasAssociatedConstraints();
diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp
index 09aaf3e31d5d..75f1107904fc 100644
--- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.cpp
@@ -19,10 +19,11 @@ namespace {
AST_MATCHER_P(TemplateTypeParmDecl, hasUnnamedDefaultArgument,
ast_matchers::internal::Matcher<TypeLoc>, InnerMatcher) {
if (Node.getIdentifier() != nullptr || !Node.hasDefaultArgument() ||
- Node.getDefaultArgumentInfo() == nullptr)
+ Node.getDefaultArgument().getArgument().isNull())
return false;
- TypeLoc DefaultArgTypeLoc = Node.getDefaultArgumentInfo()->getTypeLoc();
+ TypeLoc DefaultArgTypeLoc =
+ Node.getDefaultArgument().getTypeSourceInfo()->getTypeLoc();
return InnerMatcher.matches(DefaultArgTypeLoc, Finder, Builder);
}
diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
index a1cffbc66619..5e64d23874ec 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
@@ -144,16 +144,13 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) {
unaryOperator(hasUnaryOperand(ArrayExpr), unless(hasOperatorName("*"))),
binaryOperator(hasEitherOperand(ArrayExpr)),
castExpr(hasSourceExpression(ArrayExpr))));
- const auto PointerToArrayExpr = ignoringParenImpCasts(
- hasType(hasCanonicalType(pointerType(pointee(arrayType())))));
+ const auto PointerToArrayExpr =
+ hasType(hasCanonicalType(pointerType(pointee(arrayType()))));
- const auto StructAddrOfExpr = unaryOperator(
- hasOperatorName("&"), hasUnaryOperand(ignoringParenImpCasts(
- hasType(hasCanonicalType(recordType())))));
const auto PointerToStructType =
hasUnqualifiedDesugaredType(pointerType(pointee(recordType())));
- const auto PointerToStructExpr = ignoringParenImpCasts(expr(
- hasType(hasCanonicalType(PointerToStructType)), unless(cxxThisExpr())));
+ const auto PointerToStructExpr = expr(
+ hasType(hasCanonicalType(PointerToStructType)), unless(cxxThisExpr()));
const auto ArrayOfPointersExpr = ignoringParenImpCasts(
hasType(hasCanonicalType(arrayType(hasElementType(pointerType()))
@@ -166,18 +163,19 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) {
ignoringParenImpCasts(arraySubscriptExpr(
hasBase(ArrayOfSamePointersExpr), hasIndex(ZeroLiteral)));
const auto ArrayLengthExprDenom =
- expr(hasParent(expr(ignoringParenImpCasts(binaryOperator(
- hasOperatorName("/"), hasLHS(ignoringParenImpCasts(sizeOfExpr(
- has(ArrayOfPointersExpr)))))))),
+ expr(hasParent(binaryOperator(hasOperatorName("/"),
+ hasLHS(ignoringParenImpCasts(sizeOfExpr(
+ has(ArrayOfPointersExpr)))))),
sizeOfExpr(has(ArrayOfSamePointersZeroSubscriptExpr)));
- Finder->addMatcher(expr(anyOf(sizeOfExpr(has(ignoringParenImpCasts(anyOf(
- ArrayCastExpr, PointerToArrayExpr,
- StructAddrOfExpr, PointerToStructExpr)))),
- sizeOfExpr(has(PointerToStructType))),
- unless(ArrayLengthExprDenom))
- .bind("sizeof-pointer-to-aggregate"),
- this);
+ Finder->addMatcher(
+ expr(sizeOfExpr(anyOf(
+ has(ignoringParenImpCasts(anyOf(
+ ArrayCastExpr, PointerToArrayExpr, PointerToStructExpr))),
+ has(PointerToStructType))),
+ unless(ArrayLengthExprDenom))
+ .bind("sizeof-pointer-to-aggregate"),
+ this);
}
// Detect expression like: sizeof(expr) <= k for a suspicious constant 'k'.
diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
index 7a021fe14436..ea4d99586c71 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
@@ -177,9 +177,11 @@ matchTrailingTemplateParam(const FunctionTemplateDecl *FunctionTemplate) {
dyn_cast<TemplateTypeParmDecl>(LastParam)) {
if (LastTemplateParam->hasDefaultArgument() &&
LastTemplateParam->getIdentifier() == nullptr) {
- return {matchEnableIfSpecialization(
- LastTemplateParam->getDefaultArgumentInfo()->getTypeLoc()),
- LastTemplateParam};
+ return {
+ matchEnableIfSpecialization(LastTemplateParam->getDefaultArgument()
+ .getTypeSourceInfo()
+ ->getTypeLoc()),
+ LastTemplateParam};
}
}
return {};
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index 74152c603451..28f5eada6d82 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -50,7 +50,9 @@ StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
case CK_PointerToBoolean:
case CK_MemberPointerToBoolean: // Fall-through on purpose.
- return Context.getLangOpts().CPlusPlus11 ? "nullptr" : "0";
+ return (Context.getLangOpts().CPlusPlus11 || Context.getLangOpts().C23)
+ ? "nullptr"
+ : "0";
default:
llvm_unreachable("Unexpected cast kind");
@@ -165,6 +167,12 @@ bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) {
void fixGenericExprCastFromBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast,
ASTContext &Context, StringRef OtherType) {
+ if (!Context.getLangOpts().CPlusPlus) {
+ Diag << FixItHint::CreateInsertion(Cast->getBeginLoc(),
+ (Twine("(") + OtherType + ")").str());
+ return;
+ }
+
const Expr *SubExpr = Cast->getSubExpr();
const bool NeedParens = !isa<ParenExpr>(SubExpr->IgnoreImplicit());
const bool NeedSpace = needsSpacePrefix(Cast->getBeginLoc(), Context);
@@ -267,6 +275,10 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
auto BoolXor =
binaryOperator(hasOperatorName("^"), hasLHS(ImplicitCastFromBool),
hasRHS(ImplicitCastFromBool));
+ auto ComparisonInCall = allOf(
+ hasParent(callExpr()),
+ hasSourceExpression(binaryOperator(hasAnyOperatorName("==", "!="))));
+
Finder->addMatcher(
traverse(TK_AsIs,
implicitCastExpr(
@@ -281,6 +293,8 @@ void ImplicitBoolConversionCheck::registerMatchers(MatchFinder *Finder) {
stmt(anyOf(ifStmt(), whileStmt()), has(declStmt())))),
// Exclude cases common to implicit cast to and from bool.
unless(ExceptionCases), unless(has(BoolXor)),
+ // Exclude C23 cases common to implicit cast to bool.
+ unless(ComparisonInCall),
// Retrieve also parent statement, to check if we need
// additional parens in replacement.
optionally(hasParent(stmt().bind("parentStmt"))),
diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
index e811f5519de2..88e4886cd0df 100644
--- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp
@@ -123,6 +123,9 @@ static const NamedDecl *getFailureForNamedDecl(const NamedDecl *ND) {
if (const auto *Method = dyn_cast<CXXMethodDecl>(ND)) {
if (const CXXMethodDecl *Overridden = getOverrideMethod(Method))
Canonical = cast<NamedDecl>(Overridden->getCanonicalDecl());
+ else if (const FunctionTemplateDecl *Primary = Method->getPrimaryTemplate())
+ if (const FunctionDecl *TemplatedDecl = Primary->getTemplatedDecl())
+ Canonical = cast<NamedDecl>(TemplatedDecl->getCanonicalDecl());
if (Canonical != ND)
return Canonical;
diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp
index 06b949bc4a2b..de103e011c70 100644
--- a/clang-tools-extra/clangd/Hover.cpp
+++ b/clang-tools-extra/clangd/Hover.cpp
@@ -247,8 +247,12 @@ fetchTemplateParameters(const TemplateParameterList *Params,
if (!TTP->getName().empty())
P.Name = TTP->getNameAsString();
- if (TTP->hasDefaultArgument())
- P.Default = TTP->getDefaultArgument().getAsString(PP);
+ if (TTP->hasDefaultArgument()) {
+ P.Default.emplace();
+ llvm::raw_string_ostream Out(*P.Default);
+ TTP->getDefaultArgument().getArgument().print(PP, Out,
+ /*IncludeType=*/false);
+ }
} else if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
P.Type = printType(NTTP, PP);
@@ -258,7 +262,8 @@ fetchTemplateParameters(const TemplateParameterList *Params,
if (NTTP->hasDefaultArgument()) {
P.Default.emplace();
llvm::raw_string_ostream Out(*P.Default);
- NTTP->getDefaultArgument()->printPretty(Out, nullptr, PP);
+ NTTP->getDefaultArgument().getArgument().print(PP, Out,
+ /*IncludeType=*/false);
}
} else if (const auto *TTPD = dyn_cast<TemplateTemplateParmDecl>(Param)) {
P.Type = printType(TTPD, PP);
diff --git a/clang-tools-extra/clangd/test/infinite-instantiation.test b/clang-tools-extra/clangd/test/infinite-instantiation.test
index d379a9c2d523..a9c787c77027 100644
--- a/clang-tools-extra/clangd/test/infinite-instantiation.test
+++ b/clang-tools-extra/clangd/test/infinite-instantiation.test
@@ -1,5 +1,5 @@
// RUN: rm -rf %t.dir && mkdir -p %t.dir
-// RUN: echo '[{"directory": "%/t.dir", "command": "clang -ftemplate-depth=100 -x c++ %s", "file": "%/s"}]' > %t.dir/compile_commands.json
+// RUN: echo '[{"directory": "%/t.dir", "command": "clang -ftemplate-depth=100 -x c++ %/s", "file": "%/s"}]' > %t.dir/compile_commands.json
// RUN: not clangd --compile-commands-dir=%t.dir -check=%s 2>&1 | FileCheck -strict-whitespace %s
// CHECK: [template_recursion_depth_exceeded]
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index 0b2273f0a9a6..3220a5a6a982 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -836,7 +836,9 @@ TEST_F(TargetDeclTest, OverloadExpr) {
[[delete]] x;
}
)cpp";
- EXPECT_DECLS("CXXDeleteExpr", "void operator delete(void *) noexcept");
+ // Sized deallocation is enabled by default in C++14 onwards.
+ EXPECT_DECLS("CXXDeleteExpr",
+ "void operator delete(void *, unsigned long) noexcept");
}
TEST_F(TargetDeclTest, DependentExprs) {
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 6a9892bada91..3e3195f6f681 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -375,12 +375,15 @@ Changes in existing checks
<clang-tidy/checks/readability/identifier-naming>` check in `GetConfigPerFile`
mode by resolving symbolic links to header files. Fixed handling of Hungarian
Prefix when configured to `LowerCase`. Added support for renaming designated
- initializers. Added support for renaming macro arguments.
+ initializers. Added support for renaming macro arguments. Fixed renaming
+ conflicts arising from out-of-line member function template definitions.
- Improved :doc:`readability-implicit-bool-conversion
<clang-tidy/checks/readability/implicit-bool-conversion>` check to provide
valid fix suggestions for ``static_cast`` without a preceding space and
- fixed problem with duplicate parentheses in double implicit casts.
+ fixed problem with duplicate parentheses in double implicit casts. Corrected
+ the fix suggestions for C23 and later by using C-style casts instead of
+ ``static_cast``.
- Improved :doc:`readability-redundant-inline-specifier
<clang-tidy/checks/readability/redundant-inline-specifier>` check to properly
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
index 1ea67a0b55e9..1ab21ffeb422 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/implicit-bool-conversion.rst
@@ -96,8 +96,8 @@ The rules for generating fix-it hints are:
- ``if (!pointer)`` is changed to ``if (pointer == nullptr)``,
- in case of conversions from bool to other built-in types, an explicit
- ``static_cast`` is proposed to make it clear that a conversion is taking
- place:
+ ``static_cast`` (or a C-style cast since C23) is proposed to make it clear
+ that a conversion is taking place:
- ``int integer = boolean;`` is changed to
``int integer = static_cast<int>(boolean);``,
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp
index 78f021144b2e..f86fe8a4c5b1 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/new-delete-overloads.cpp
@@ -12,16 +12,6 @@ struct S {
// CHECK-MESSAGES: :[[@LINE+1]]:7: warning: declaration of 'operator new' has no matching declaration of 'operator delete' at the same scope
void *operator new(size_t size) noexcept(false);
-struct T {
- // Sized deallocations are not enabled by default, and so this new/delete pair
- // does not match. However, we expect only one warning, for the new, because
- // the operator delete is a placement delete and we do not warn on mismatching
- // placement operations.
- // CHECK-MESSAGES: :[[@LINE+1]]:9: warning: declaration of 'operator new' has no matching declaration of 'operator delete' at the same scope
- void *operator new(size_t size) noexcept;
- void operator delete(void *ptr, size_t) noexcept; // ok only if sized deallocation is enabled
-};
-
struct U {
void *operator new(size_t size) noexcept;
void operator delete(void *ptr) noexcept;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp
new file mode 100644
index 000000000000..f807875e2769
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-outofline.cpp
@@ -0,0 +1,30 @@
+// RUN: %check_clang_tidy %s readability-identifier-naming %t -std=c++20 \
+// RUN: --config='{CheckOptions: { \
+// RUN: readability-identifier-naming.MethodCase: CamelCase, \
+// RUN: }}'
+
+namespace SomeNamespace {
+namespace Inner {
+
+class SomeClass {
+public:
+ template <typename T>
+ int someMethod();
+// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for method 'someMethod' [readability-identifier-naming]
+// CHECK-FIXES: {{^}} int SomeMethod();
+};
+template <typename T>
+int SomeClass::someMethod() {
+// CHECK-FIXES: {{^}}int SomeClass::SomeMethod() {
+ return 5;
+}
+
+} // namespace Inner
+
+void someFunc() {
+ Inner::SomeClass S;
+ S.someMethod<int>();
+// CHECK-FIXES: {{^}} S.SomeMethod<int>();
+}
+
+} // namespace SomeNamespace
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c
new file mode 100644
index 000000000000..a8c69858f76b
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c
@@ -0,0 +1,354 @@
+// RUN: %check_clang_tidy %s readability-implicit-bool-conversion %t -- -- -std=c23
+
+#undef NULL
+#define NULL 0L
+
+void functionTakingBool(bool);
+void functionTakingInt(int);
+void functionTakingUnsignedLong(unsigned long);
+void functionTakingChar(char);
+void functionTakingFloat(float);
+void functionTakingDouble(double);
+void functionTakingSignedChar(signed char);
+
+
+////////// Implicit conversion from bool.
+
+void implicitConversionFromBoolSimpleCases() {
+ bool boolean = true;
+
+ functionTakingBool(boolean);
+
+ functionTakingInt(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: implicit conversion 'bool' -> 'int' [readability-implicit-bool-conversion]
+ // CHECK-FIXES: functionTakingInt((int)boolean);
+
+ functionTakingUnsignedLong(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: implicit conversion 'bool' -> 'unsigned long'
+ // CHECK-FIXES: functionTakingUnsignedLong((unsigned long)boolean);
+
+ functionTakingChar(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'bool' -> 'char'
+ // CHECK-FIXES: functionTakingChar((char)boolean);
+
+ functionTakingFloat(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: implicit conversion 'bool' -> 'float'
+ // CHECK-FIXES: functionTakingFloat((float)boolean);
+
+ functionTakingDouble(boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'bool' -> 'double'
+ // CHECK-FIXES: functionTakingDouble((double)boolean);
+}
+
+float implicitConversionFromBoolInReturnValue() {
+ bool boolean = false;
+ return boolean;
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: implicit conversion 'bool' -> 'float'
+ // CHECK-FIXES: return (float)boolean;
+}
+
+void implicitConversionFromBoolInSingleBoolExpressions(bool b1, bool b2) {
+ bool boolean = true;
+ boolean = b1 ^ b2;
+ boolean |= !b1 || !b2;
+ boolean &= b1;
+
+ int integer = boolean - 3;
+ // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: int integer = (int)boolean - 3;
+
+ float floating = boolean / 0.3f;
+ // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: implicit conversion 'bool' -> 'float'
+ // CHECK-FIXES: float floating = (float)boolean / 0.3f;
+
+ char character = boolean;
+ // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: implicit conversion 'bool' -> 'char'
+ // CHECK-FIXES: char character = (char)boolean;
+}
+
+void implicitConversionFromBoolInComplexBoolExpressions() {
+ bool boolean = true;
+ bool anotherBoolean = false;
+
+ int integer = boolean && anotherBoolean;
+ // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-MESSAGES: :[[@LINE-2]]:28: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: int integer = (int)boolean && (int)anotherBoolean;
+
+ float floating = (boolean || anotherBoolean) * 0.3f;
+ // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-MESSAGES: :[[@LINE-2]]:32: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: float floating = ((int)boolean || (int)anotherBoolean) * 0.3f;
+
+ double doubleFloating = (boolean && (anotherBoolean || boolean)) * 0.3;
+ // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-MESSAGES: :[[@LINE-2]]:40: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-MESSAGES: :[[@LINE-3]]:58: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: double doubleFloating = ((int)boolean && ((int)anotherBoolean || (int)boolean)) * 0.3;
+}
+
+void implicitConversionFromBoolLiterals() {
+ functionTakingInt(true);
+ // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: functionTakingInt(1);
+
+ functionTakingUnsignedLong(false);
+ // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: implicit conversion 'bool' -> 'unsigned long'
+ // CHECK-FIXES: functionTakingUnsignedLong(0u);
+
+ functionTakingSignedChar(true);
+ // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: implicit conversion 'bool' -> 'signed char'
+ // CHECK-FIXES: functionTakingSignedChar(1);
+
+ functionTakingFloat(false);
+ // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: implicit conversion 'bool' -> 'float'
+ // CHECK-FIXES: functionTakingFloat(0.0f);
+
+ functionTakingDouble(true);
+ // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'bool' -> 'double'
+ // CHECK-FIXES: functionTakingDouble(1.0);
+}
+
+void implicitConversionFromBoolInComparisons() {
+ bool boolean = true;
+ int integer = 0;
+
+ functionTakingBool(boolean == integer);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: functionTakingBool((int)boolean == integer);
+
+ functionTakingBool(integer != boolean);
+ // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: functionTakingBool(integer != (int)boolean);
+}
+
+void ignoreBoolComparisons() {
+ bool boolean = true;
+ bool anotherBoolean = false;
+
+ functionTakingBool(boolean == anotherBoolean);
+ functionTakingBool(boolean != anotherBoolean);
+}
+
+void ignoreExplicitCastsFromBool() {
+ bool boolean = true;
+
+ int integer = (int)boolean + 3;
+ float floating = (float)boolean * 0.3f;
+ char character = (char)boolean;
+}
+
+void ignoreImplicitConversionFromBoolInMacroExpansions() {
+ bool boolean = true;
+
+ #define CAST_FROM_BOOL_IN_MACRO_BODY boolean + 3
+ int integerFromMacroBody = CAST_FROM_BOOL_IN_MACRO_BODY;
+
+ #define CAST_FROM_BOOL_IN_MACRO_ARGUMENT(x) x + 3
+ int integerFromMacroArgument = CAST_FROM_BOOL_IN_MACRO_ARGUMENT(boolean);
+}
+
+////////// Implicit conversions to bool.
+
+void implicitConversionToBoolSimpleCases() {
+ int integer = 10;
+ functionTakingBool(integer);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(integer != 0);
+
+ unsigned long unsignedLong = 10;
+ functionTakingBool(unsignedLong);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'unsigned long' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(unsignedLong != 0u);
+
+ float floating = 0.0f;
+ functionTakingBool(floating);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(floating != 0.0f);
+
+ double doubleFloating = 1.0f;
+ functionTakingBool(doubleFloating);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'double' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(doubleFloating != 0.0);
+
+ signed char character = 'a';
+ functionTakingBool(character);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'signed char' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(character != 0);
+
+ int* pointer = nullptr;
+ functionTakingBool(pointer);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int *' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(pointer != nullptr);
+}
+
+void implicitConversionToBoolInSingleExpressions() {
+ int integer = 10;
+ bool boolComingFromInt;
+ boolComingFromInt = integer;
+ // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: boolComingFromInt = (integer != 0);
+
+ float floating = 10.0f;
+ bool boolComingFromFloat;
+ boolComingFromFloat = floating;
+ // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: boolComingFromFloat = (floating != 0.0f);
+
+ signed char character = 'a';
+ bool boolComingFromChar;
+ boolComingFromChar = character;
+ // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'signed char' -> 'bool'
+ // CHECK-FIXES: boolComingFromChar = (character != 0);
+
+ int* pointer = nullptr;
+ bool boolComingFromPointer;
+ boolComingFromPointer = pointer;
+ // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: implicit conversion 'int *' -> 'bool'
+ // CHECK-FIXES: boolComingFromPointer = (pointer != nullptr);
+}
+
+void implicitConversionToBoolInComplexExpressions() {
+ bool boolean = true;
+
+ int integer = 10;
+ int anotherInteger = 20;
+ bool boolComingFromInteger;
+ boolComingFromInteger = integer + anotherInteger;
+ // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: boolComingFromInteger = ((integer + anotherInteger) != 0);
+}
+
+void implicitConversionInNegationExpressions() {
+ int integer = 10;
+ bool boolComingFromNegatedInt;
+ boolComingFromNegatedInt = !integer;
+ // CHECK-MESSAGES: :[[@LINE-1]]:30: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: boolComingFromNegatedInt = ((!integer) != 0);
+}
+
+bool implicitConversionToBoolInReturnValue() {
+ float floating = 1.0f;
+ return floating;
+ // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: return floating != 0.0f;
+}
+
+void implicitConversionToBoolFromLiterals() {
+ functionTakingBool(0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(false);
+
+ functionTakingBool(1);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool(2ul);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'unsigned long' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool(0.0f);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(false);
+
+ functionTakingBool(1.0f);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool(2.0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'double' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool('\0');
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(false);
+
+ functionTakingBool('a');
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool("");
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'char *' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool("abc");
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'char *' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(true);
+
+ functionTakingBool(NULL);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'long' -> 'bool'
+ // CHECK-FIXES: functionTakingBool(false);
+}
+
+void implicitConversionToBoolFromUnaryMinusAndZeroLiterals() {
+ functionTakingBool(-0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: functionTakingBool((-0) != 0);
+
+ functionTakingBool(-0.0f);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'float' -> 'bool'
+ // CHECK-FIXES: functionTakingBool((-0.0f) != 0.0f);
+
+ functionTakingBool(-0.0);
+ // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: implicit conversion 'double' -> 'bool'
+ // CHECK-FIXES: functionTakingBool((-0.0) != 0.0);
+}
+
+void ignoreExplicitCastsToBool() {
+ int integer = 10;
+ bool boolComingFromInt = (bool)integer;
+
+ float floating = 10.0f;
+ bool boolComingFromFloat = (bool)floating;
+
+ char character = 'a';
+ bool boolComingFromChar = (bool)character;
+
+ int* pointer = nullptr;
+ bool booleanComingFromPointer = (bool)pointer;
+}
+
+void ignoreImplicitConversionToBoolInMacroExpansions() {
+ int integer = 3;
+
+ #define CAST_TO_BOOL_IN_MACRO_BODY integer && false
+ bool boolFromMacroBody = CAST_TO_BOOL_IN_MACRO_BODY;
+
+ #define CAST_TO_BOOL_IN_MACRO_ARGUMENT(x) x || true
+ bool boolFromMacroArgument = CAST_TO_BOOL_IN_MACRO_ARGUMENT(integer);
+}
+
+int implicitConversionReturnInt()
+{
+ return true;
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: return 1
+}
+
+int implicitConversionReturnIntWithParens()
+{
+ return (true);
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: implicit conversion 'bool' -> 'int'
+ // CHECK-FIXES: return 1
+}
+
+bool implicitConversionReturnBool()
+{
+ return 1;
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: return true
+}
+
+bool implicitConversionReturnBoolWithParens()
+{
+ return (1);
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: implicit conversion 'int' -> 'bool'
+ // CHECK-FIXES: return true
+}
+
+int keepCompactReturnInC_PR71848() {
+ bool foo = false;
+ return( foo );
+// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: implicit conversion 'bool' -> 'int' [readability-implicit-bool-conversion]
+// CHECK-FIXES: return(int)( foo );
+}
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index c20ce47a12ab..a6bcb853a464 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -349,10 +349,7 @@ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -Wno-long-long")
endif ()
- check_cxx_compiler_flag("-Werror -Wnested-anon-types" CXX_SUPPORTS_NO_NESTED_ANON_TYPES_FLAG)
- if( CXX_SUPPORTS_NO_NESTED_ANON_TYPES_FLAG )
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" )
- endif()
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" )
endif ()
# Determine HOST_LINK_VERSION on Darwin.
diff --git a/clang/cmake/caches/CrossWinToARMLinux.cmake b/clang/cmake/caches/CrossWinToARMLinux.cmake
index 736a54ece550..62e87c6c62f8 100644
--- a/clang/cmake/caches/CrossWinToARMLinux.cmake
+++ b/clang/cmake/caches/CrossWinToARMLinux.cmake
@@ -89,6 +89,13 @@ endif()
message(STATUS "Toolchain target to build: ${LLVM_TARGETS_TO_BUILD}")
+# Allow overriding the libc++ ABI version. Use 2 by default.
+if (NOT DEFINED LIBCXX_ABI_VERSION)
+ set(LIBCXX_ABI_VERSION 2)
+endif()
+
+message(STATUS "Toolchain's Libc++ ABI version: ${LIBCXX_ABI_VERSION}")
+
if (NOT DEFINED CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "")
endif()
@@ -109,8 +116,15 @@ set(CLANG_DEFAULT_OBJCOPY "llvm-objcopy" CACHE STRING "")
set(CLANG_DEFAULT_RTLIB "compiler-rt" CACHE STRING "")
set(CLANG_DEFAULT_UNWINDLIB "libunwind" CACHE STRING "")
-if(WIN32)
- set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded" CACHE STRING "")
+if (NOT DEFINED CMAKE_MSVC_RUNTIME_LIBRARY AND WIN32)
+ # Note: Always specify the MT DLL runtime for LLDB build configurations on a Windows host.
+ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+ set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDebugDLL" CACHE STRING "")
+ else()
+ set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreadedDLL" CACHE STRING "")
+ endif()
+ # Grab all ucrt/vcruntime related DLLs into the binary installation folder.
+ set(CMAKE_INSTALL_UCRT_LIBRARIES ON CACHE BOOL "")
endif()
# Set up RPATH for the target runtime/builtin libraries.
@@ -127,6 +141,15 @@ set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_INSTALL_RPATH
set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE BOOL "")
set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_LLVM_CMAKE_DIR "${LLVM_PROJECT_DIR}/llvm/cmake/modules" CACHE PATH "")
+if (DEFINED TOOLCHAIN_TARGET_COMPILER_FLAGS)
+ foreach(lang C;CXX;ASM)
+ set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_${lang}_FLAGS "${TOOLCHAIN_TARGET_COMPILER_FLAGS}" CACHE STRING "")
+ endforeach()
+endif()
+foreach(type SHARED;MODULE;EXE)
+ set(BUILTINS_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "")
+endforeach()
+
set(LLVM_RUNTIME_TARGETS "${TOOLCHAIN_TARGET_TRIPLE}" CACHE STRING "")
set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
@@ -137,6 +160,15 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_SYSROOT
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_INSTALL_RPATH "${RUNTIMES_INSTALL_RPATH}" CACHE STRING "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE BOOL "")
+if (DEFINED TOOLCHAIN_TARGET_COMPILER_FLAGS)
+ foreach(lang C;CXX;ASM)
+ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_${lang}_FLAGS "${TOOLCHAIN_TARGET_COMPILER_FLAGS}" CACHE STRING "")
+ endforeach()
+endif()
+foreach(type SHARED;MODULE;EXE)
+ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_CMAKE_${type}_LINKER_FLAGS "-fuse-ld=lld" CACHE STRING "")
+endforeach()
+
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_BUILTINS ON CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_SANITIZERS OFF CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_COMPILER_RT_BUILD_XRAY OFF CACHE BOOL "")
@@ -164,7 +196,7 @@ set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXXABI_ENABLE_SHARED
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_USE_COMPILER_RT ON CACHE BOOL "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_SHARED OFF CACHE BOOL "")
-set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ABI_VERSION 2 CACHE STRING "")
+set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ABI_VERSION ${LIBCXX_ABI_VERSION} CACHE STRING "")
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_CXX_ABI "libcxxabi" CACHE STRING "") #!!!
set(RUNTIMES_${TOOLCHAIN_TARGET_TRIPLE}_LIBCXX_ENABLE_NEW_DELETE_DEFINITIONS ON CACHE BOOL "")
diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake
index d5546e20873b..66e764968e85 100644
--- a/clang/cmake/caches/Fuchsia-stage2.cmake
+++ b/clang/cmake/caches/Fuchsia-stage2.cmake
@@ -19,7 +19,6 @@ set(LLVM_ENABLE_LLD ON CACHE BOOL "")
set(LLVM_ENABLE_LTO ON CACHE BOOL "")
set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
set(LLVM_ENABLE_PLUGINS OFF CACHE BOOL "")
-set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "")
set(LLVM_ENABLE_Z3_SOLVER OFF CACHE BOOL "")
set(LLVM_ENABLE_ZLIB ON CACHE BOOL "")
diff --git a/clang/cmake/caches/Fuchsia.cmake b/clang/cmake/caches/Fuchsia.cmake
index 30a3b9116a46..4d3af3ad3f40 100644
--- a/clang/cmake/caches/Fuchsia.cmake
+++ b/clang/cmake/caches/Fuchsia.cmake
@@ -12,7 +12,6 @@ set(LLVM_ENABLE_DIA_SDK OFF CACHE BOOL "")
set(LLVM_ENABLE_LIBEDIT OFF CACHE BOOL "")
set(LLVM_ENABLE_LIBXML2 OFF CACHE BOOL "")
set(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR ON CACHE BOOL "")
-set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
set(LLVM_ENABLE_UNWIND_TABLES OFF CACHE BOOL "")
set(LLVM_ENABLE_Z3_SOLVER OFF CACHE BOOL "")
set(LLVM_ENABLE_ZLIB OFF CACHE BOOL "")
@@ -34,7 +33,6 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
LibXml2_ROOT
LLVM_ENABLE_CURL
LLVM_ENABLE_HTTPLIB
- LLVM_ENABLE_TERMINFO
LLVM_ENABLE_LIBEDIT
CURL_ROOT
OpenSSL_ROOT
@@ -48,11 +46,6 @@ set(_FUCHSIA_BOOTSTRAP_PASSTHROUGH
PANEL_LIBRARIES
# Deprecated
- Terminfo_ROOT
-
- Terminfo_LIBRARIES
-
- # Deprecated
LibEdit_ROOT
LibEdit_INCLUDE_DIRS
diff --git a/clang/cmake/caches/VectorEngine.cmake b/clang/cmake/caches/VectorEngine.cmake
index 2f968a21cc40..b429fb0997d7 100644
--- a/clang/cmake/caches/VectorEngine.cmake
+++ b/clang/cmake/caches/VectorEngine.cmake
@@ -13,9 +13,7 @@
# ninja
#
-# Disable TERMINFO, ZLIB, and ZSTD for VE since there is no pre-compiled
-# libraries.
-set(LLVM_ENABLE_TERMINFO OFF CACHE BOOL "")
+# Disable ZLIB and ZSTD for VE since there are no pre-compiled libraries.
set(LLVM_ENABLE_ZLIB OFF CACHE BOOL "")
set(LLVM_ENABLE_ZSTD OFF CACHE BOOL "")
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 81e9d0423f96..d023f53754cb 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -59,6 +59,18 @@ C++ Specific Potentially Breaking Changes
- Clang now performs semantic analysis for unary operators with dependent operands
that are known to be of non-class non-enumeration type prior to instantiation.
+ This change uncovered a bug in libstdc++ 14.1.0 which may cause compile failures
+ on systems using that version of libstdc++ and Clang 19, with an error that looks
+ something like this:
+
+ .. code-block:: text
+
+ <source>:4:5: error: expression is not assignable
+ 4 | ++this;
+ | ^ ~~~~
+
+ To fix this, update libstdc++ to version 14.1.1 or greater.
+
ABI Changes in This Version
---------------------------
- Fixed Microsoft name mangling of implicitly defined variables used for thread
@@ -155,6 +167,11 @@ C++17 Feature Support
files because they may not be stable across multiple TUs (the values may vary
based on compiler version as well as CPU tuning). #GH60174
+C++14 Feature Support
+^^^^^^^^^^^^^^^^^^^^^
+- Sized deallocation is enabled by default from C++14 onwards. The user may specify
+ ``-fno-sized-deallocation`` to disable it if it causes regressions.
+
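To make the sized-deallocation note above concrete, here is a minimal C++ sketch (assuming a C++14-or-later compile where the new default applies; all names are illustrative). The delete expression now selects the two-argument global ``operator delete``; building with ``-fno-sized-deallocation`` falls back to the single-argument overload.

.. code-block:: c++

   #include <cstddef>
   #include <cstdio>
   #include <cstdlib>
   #include <new>

   // Replace the global allocation/deallocation functions for illustration.
   void *operator new(std::size_t size) {
     if (void *p = std::malloc(size))
       return p;
     throw std::bad_alloc();
   }
   void operator delete(void *p) noexcept { std::free(p); }
   void operator delete(void *p, std::size_t size) noexcept {
     std::printf("deallocating %zu bytes\n", size);
     std::free(p);
   }

   int main() {
     int *x = new int(7);
     delete x; // with sized deallocation on by default, the sized overload runs
   }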
C++20 Feature Support
^^^^^^^^^^^^^^^^^^^^^
@@ -317,13 +334,18 @@ New Compiler Flags
- ``-fexperimental-late-parse-attributes`` enables an experimental feature to
allow late parsing certain attributes in specific contexts where they would
- not normally be late parsed.
+ not normally be late parsed. Currently this allows late parsing the
+ `counted_by` attribute in C. See `Attribute Changes in Clang`_.
- ``-fseparate-named-sections`` uses separate unique sections for global
symbols in named special sections (i.e. symbols annotated with
``__attribute__((section(...)))``. This enables linker GC to collect unused
symbols without having to use a per-symbol section.
+- ``-fms-define-stdc`` and its clang-cl counterpart ``/Zc:__STDC__``.
+ Matches MSVC behaviour by defining ``__STDC__`` to ``1`` when
+ MSVC compatibility mode is used. It has no effect for C++ code.
+
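As a hedged illustration of the ``-fseparate-named-sections`` flag described above (symbol names invented for the example): both globals request the same named section, but with the flag each is emitted into its own unique section, so the linker's section GC can discard the unused symbol on its own.

.. code-block:: c++

   // With -fseparate-named-sections, each of these globals lands in its own
   // unique section (both still named .mydata), so an unused one can be
   // dropped by linker garbage collection without affecting the other.
   __attribute__((section(".mydata"))) int used_counter = 1;
   __attribute__((section(".mydata"))) int unused_counter = 2;

   int read_used() { return used_counter; }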
Deprecated Compiler Flags
-------------------------
@@ -406,6 +428,24 @@ Attribute Changes in Clang
- The ``clspv_libclc_builtin`` attribute has been added to allow clspv
(`OpenCL-C to Vulkan SPIR-V compiler <https://github.com/google/clspv>`_) to identify functions coming from libclc
(`OpenCL-C builtin library <https://libclc.llvm.org>`_).
+- The ``counted_by`` attribute is now allowed on pointers that are members of a
+ struct in C.
+
+- The ``counted_by`` attribute can now be late parsed in C when
+ ``-fexperimental-late-parse-attributes`` is passed, but only when the
+ attribute is used in the declaration attribute position. This allows using
+ the attribute on existing code where it was previously impossible to do so,
+ because re-ordering the struct field declarations to avoid the need for late
+ parsing would break the ABI, as shown below.
+
+ .. code-block:: c
+
+ struct BufferTy {
+ /* Referring to `count` requires late parsing */
+ char* buffer __counted_by(count);
+ /* Swapping `buffer` and `count` to avoid late parsing would break ABI */
+ size_t count;
+ };
+
Improvements to Clang's diagnostics
-----------------------------------
@@ -749,6 +789,11 @@ Bug Fixes to C++ Support
- Clang now correctly diagnoses when the current instantiation is used as an incomplete base class.
- Clang no longer treats ``constexpr`` class scope function template specializations of non-static members
as implicitly ``const`` in language modes after C++11.
+- Fixed a crash when trying to emit captures in a lambda call operator with an explicit object
+ parameter that is called on a derived type of the lambda.
+ Fixes (#GH87210), (#GH89541).
+- Clang no longer tries to check if an expression is immediate-escalating in an unevaluated context.
+ Fixes (#GH91308).
Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -761,12 +806,15 @@ Miscellaneous Bug Fixes
- Fixed an infinite recursion in ASTImporter, on return type declared inside
body of C++11 lambda without trailing return (#GH68775).
+- Fixed the declaration name source location of instantiated function definitions (#GH71161).
+- Improved diagnostic output to print an expression instead of 'no argument' when comparing Values as template arguments.
Miscellaneous Clang Crashes Fixed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Do not attempt to dump the layout of dependent types or invalid declarations
when ``-fdump-record-layouts-complete`` is passed. Fixes #GH83684.
+- Fixed a crash on unhandled StructuralValues in the template differ (#GH93068).
OpenACC Specific Changes
------------------------
@@ -780,6 +828,8 @@ AMDGPU Support
X86 Support
^^^^^^^^^^^
+- Removed KNL/KNM specific ISA support: AVX512PF, AVX512ER, PREFETCHWT1
+
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^
@@ -832,6 +882,10 @@ Windows Support
including STL headers will no longer slow down compile times since ``intrin.h``
is not included from MSVC STL.
+- When the target triple is `*-windows-msvc`, strict aliasing is now disabled by default
+ to ensure compatibility with MSVC. Previously, strict aliasing was only disabled if the
+ driver mode was cl.
+
LoongArch Support
^^^^^^^^^^^^^^^^^
diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst
index eb8b58323da4..ac9f0b06f63b 100644
--- a/clang/docs/analyzer/checkers.rst
+++ b/clang/docs/analyzer/checkers.rst
@@ -1179,6 +1179,47 @@ security.insecureAPI.DeprecatedOrUnsafeBufferHandling (C)
strncpy(buf, "a", 1); // warn
}
+security.SetgidSetuidOrder (C)
+""""""""""""""""""""""""""""""
+When dropping user-level and group-level privileges in a program by using
+``setuid`` and ``setgid`` calls, it is important to reset the group-level
+privileges (with ``setgid``) first. The ``setgid`` call will likely fail if
+superuser privileges have already been dropped.
+
+The checker checks for sequences of ``setuid(getuid())`` and
+``setgid(getgid())`` calls (in this order). If such a sequence is found and
+there is no other privilege-changing function call (``seteuid``, ``setreuid``,
+``setresuid`` and the GID versions of these) in between, a warning is
+generated. The checker matches only literal ``setuid(getuid())`` calls (and
+the GID versions); it does not, for example, handle the case where the result
+of ``getuid()`` is first stored in a variable.
+
+.. code-block:: c
+
+ void test1() {
+ // ...
+ // end of section with elevated privileges
+ // reset privileges (user and group) to normal user
+ if (setuid(getuid()) != 0) {
+ handle_error();
+ return;
+ }
+ if (setgid(getgid()) != 0) { // warning: A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail
+ handle_error();
+ return;
+ }
+ // user-ID and group-ID are reset to normal user now
+ // ...
+ }
+
+In the code above, the problem is that ``setuid(getuid())`` removes superuser
+privileges before ``setgid(getgid())`` is called. To fix the problem,
+``setgid(getgid())`` should be called first. Take care also to avoid code like
+``setgid(getuid())`` (this checker does not detect such bugs) and to always
+check the return value of these calls.
+
+This check corresponds to SEI CERT Rule `POS36-C <https://wiki.sei.cmu.edu/confluence/display/c/POS36-C.+Observe+correct+revocation+order+while+relinquishing+privileges>`_.
+
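For contrast with the warning example above, a minimal sketch of the order the checker expects (group privileges reset before user privileges); the function name is illustrative and error handling is reduced to the bare minimum.

.. code-block:: c

   #include <unistd.h>

   /* Correct revocation order: reset the group ID while the process still has
      the privileges needed for setgid to succeed, then reset the user ID. */
   int drop_privileges(void) {
     if (setgid(getgid()) != 0)
       return -1;
     if (setuid(getuid()) != 0)
       return -1;
     return 0;
   }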
.. _unix-checkers:
unix
@@ -2792,6 +2833,31 @@ Warn on mmap() calls that are both writable and executable.
// code
}
+.. _alpha-security-putenv-stack-array:
+
+alpha.security.PutenvStackArray (C)
+"""""""""""""""""""""""""""""""""""
+Finds calls to the ``putenv`` function which pass a pointer to a stack-allocated
+(automatic) array as the argument. ``putenv`` does not copy the passed string;
+only a pointer to the data is stored, and this data can be read even by other
+threads. The content of a stack-allocated array is likely to be overwritten
+after the enclosing function returns.
+
+The problem can be solved by using a static array variable or dynamically
+allocated memory. Even better is to avoid using ``putenv`` (it has other
+problems related to memory leaks) and use ``setenv`` instead.
+
+The check corresponds to CERT rule
+`POS34-C. Do not call putenv() with a pointer to an automatic variable as the argument
+<https://wiki.sei.cmu.edu/confluence/display/c/POS34-C.+Do+not+call+putenv%28%29+with+a+pointer+to+an+automatic+variable+as+the+argument>`_.
+
+.. code-block:: c
+
+ int f() {
+ char env[] = "NAME=value";
+ return putenv(env); // putenv function should not be called with stack-allocated string
+ }
+
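And a sketch of the suggested fix (names illustrative): the same environment update written with ``setenv``, which copies its arguments, so no pointer into the caller's stack frame is left in the environment.

.. code-block:: c

   #include <stdlib.h>

   int set_name(void) {
     /* setenv copies both strings; the third argument (1) means overwrite
        any existing value of NAME. */
     return setenv("NAME", "value", 1);
   }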
.. _alpha-security-ReturnPtrRange:
alpha.security.ReturnPtrRange (C)
@@ -2818,55 +2884,6 @@ alpha.security.cert
SEI CERT checkers which tries to find errors based on their `C coding rules <https://wiki.sei.cmu.edu/confluence/display/c/2+Rules>`_.
-.. _alpha-security-cert-pos-checkers:
-
-alpha.security.cert.pos
-^^^^^^^^^^^^^^^^^^^^^^^
-
-SEI CERT checkers of `POSIX C coding rules <https://wiki.sei.cmu.edu/confluence/pages/viewpage.action?pageId=87152405>`_.
-
-.. _alpha-security-cert-pos-34c:
-
-alpha.security.cert.pos.34c
-"""""""""""""""""""""""""""
-Finds calls to the ``putenv`` function which pass a pointer to an automatic variable as the argument.
-
-.. code-block:: c
-
- int func(const char *var) {
- char env[1024];
- int retval = snprintf(env, sizeof(env),"TEST=%s", var);
- if (retval < 0 || (size_t)retval >= sizeof(env)) {
- /* Handle error */
- }
-
- return putenv(env); // putenv function should not be called with auto variables
- }
-
-Limitations:
-
- - Technically, one can pass automatic variables to ``putenv``,
- but one needs to ensure that the given environment key stays
- alive until it's removed or overwritten.
- Since the analyzer cannot keep track of which envvars get overwritten
- and when, it needs to be slightly more aggressive and warn for such
- cases too, leading in some cases to false-positive reports like this:
-
- .. code-block:: c
-
- void baz() {
- char env[] = "NAME=value";
- putenv(env); // false-positive warning: putenv function should not be called...
- // More code...
- putenv((char *)"NAME=anothervalue");
- // This putenv call overwrites the previous entry, thus that can no longer dangle.
- } // 'env' array becomes dead only here.
-
-alpha.security.cert.env
-^^^^^^^^^^^^^^^^^^^^^^^
-
-SEI CERT checkers of `Environment C coding rules <https://wiki.sei.cmu.edu/confluence/x/JdcxBQ>`_.
-
alpha.security.taint
^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 2ce2b810d363..a1d1d1c51cd4 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -110,6 +110,9 @@ class VarTemplateDecl;
class VTableContextBase;
class XRayFunctionFilter;
+/// A simple array of base specifiers.
+typedef SmallVector<CXXBaseSpecifier *, 4> CXXCastPath;
+
namespace Builtin {
class Context;
@@ -1170,6 +1173,12 @@ public:
/// in device compilation.
llvm::DenseSet<const FunctionDecl *> CUDAImplicitHostDeviceFunUsedByDevice;
+ /// For capturing lambdas with an explicit object parameter whose type is
+ /// derived from the lambda type, we need to perform derived-to-base
+ /// conversion so we can access the captures; the cast paths for that
+ /// are stored here.
+ llvm::DenseMap<const CXXMethodDecl *, CXXCastPath> LambdaCastPaths;
+
ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents,
SelectorTable &sels, Builtin::Context &builtins,
TranslationUnitKind TUKind);
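The ``LambdaCastPaths`` comment above corresponds to the lambda bug fix mentioned in the release notes earlier in this diff. A minimal C++23 sketch of the situation (names illustrative): the explicit object parameter deduces to a type derived from the closure type, so reading the capture requires a derived-to-base conversion.

.. code-block:: c++

   int main() {
     int captured = 42;
     auto lambda = [captured](this auto &&self) { return captured; };

     // Derive from the closure type; calling through Derived makes the
     // explicit object parameter a Derived, not the closure type itself,
     // so the capture access needs a derived-to-base cast path.
     struct Derived : decltype(lambda) {};
     Derived d{lambda};
     return d(); // the release note says this previously crashed Clang
   }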
diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h
index bf7c204e4ad7..616f92691ec3 100644
--- a/clang/include/clang/AST/ASTNodeTraverser.h
+++ b/clang/include/clang/AST/ASTNodeTraverser.h
@@ -695,7 +695,7 @@ public:
if (const auto *TC = D->getTypeConstraint())
Visit(TC->getImmediatelyDeclaredConstraint());
if (D->hasDefaultArgument())
- Visit(D->getDefaultArgument(), SourceRange(),
+ Visit(D->getDefaultArgument().getArgument(), SourceRange(),
D->getDefaultArgStorage().getInheritedFrom(),
D->defaultArgumentWasInherited() ? "inherited from" : "previous");
}
@@ -704,9 +704,9 @@ public:
if (const auto *E = D->getPlaceholderTypeConstraint())
Visit(E);
if (D->hasDefaultArgument())
- Visit(D->getDefaultArgument(), SourceRange(),
- D->getDefaultArgStorage().getInheritedFrom(),
- D->defaultArgumentWasInherited() ? "inherited from" : "previous");
+ dumpTemplateArgumentLoc(
+ D->getDefaultArgument(), D->getDefaultArgStorage().getInheritedFrom(),
+ D->defaultArgumentWasInherited() ? "inherited from" : "previous");
}
void VisitTemplateTemplateParmDecl(const TemplateTemplateParmDecl *D) {
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index 5e485ccb85a1..7fd80b90d103 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -2188,6 +2188,8 @@ public:
void setRangeEnd(SourceLocation E) { EndRangeLoc = E; }
+ void setDeclarationNameLoc(DeclarationNameLoc L) { DNLoc = L; }
+
/// Returns the location of the ellipsis of a variadic function.
SourceLocation getEllipsisLoc() const {
const auto *FPT = getType()->getAs<FunctionProtoType>();
diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h
index f3d6a321ecf1..5b6a6b40b28e 100644
--- a/clang/include/clang/AST/DeclTemplate.h
+++ b/clang/include/clang/AST/DeclTemplate.h
@@ -1185,7 +1185,7 @@ class TemplateTypeParmDecl final : public TypeDecl,
/// The default template argument, if any.
using DefArgStorage =
- DefaultArgStorage<TemplateTypeParmDecl, TypeSourceInfo *>;
+ DefaultArgStorage<TemplateTypeParmDecl, TemplateArgumentLoc *>;
DefArgStorage DefaultArgument;
TemplateTypeParmDecl(DeclContext *DC, SourceLocation KeyLoc,
@@ -1225,13 +1225,9 @@ public:
bool hasDefaultArgument() const { return DefaultArgument.isSet(); }
/// Retrieve the default argument, if any.
- QualType getDefaultArgument() const {
- return DefaultArgument.get()->getType();
- }
-
- /// Retrieves the default argument's source information, if any.
- TypeSourceInfo *getDefaultArgumentInfo() const {
- return DefaultArgument.get();
+ const TemplateArgumentLoc &getDefaultArgument() const {
+ static const TemplateArgumentLoc NoneLoc;
+ return DefaultArgument.isSet() ? *DefaultArgument.get() : NoneLoc;
}
/// Retrieves the location of the default argument declaration.
@@ -1244,9 +1240,8 @@ public:
}
/// Set the default argument for this template parameter.
- void setDefaultArgument(TypeSourceInfo *DefArg) {
- DefaultArgument.set(DefArg);
- }
+ void setDefaultArgument(const ASTContext &C,
+ const TemplateArgumentLoc &DefArg);
/// Set that this default argument was inherited from another
/// parameter.
@@ -1365,7 +1360,8 @@ class NonTypeTemplateParmDecl final
/// The default template argument, if any, and whether or not
/// it was inherited.
- using DefArgStorage = DefaultArgStorage<NonTypeTemplateParmDecl, Expr *>;
+ using DefArgStorage =
+ DefaultArgStorage<NonTypeTemplateParmDecl, TemplateArgumentLoc *>;
DefArgStorage DefaultArgument;
// FIXME: Collapse this into TemplateParamPosition; or, just move depth/index
@@ -1435,7 +1431,10 @@ public:
bool hasDefaultArgument() const { return DefaultArgument.isSet(); }
/// Retrieve the default argument, if any.
- Expr *getDefaultArgument() const { return DefaultArgument.get(); }
+ const TemplateArgumentLoc &getDefaultArgument() const {
+ static const TemplateArgumentLoc NoneLoc;
+ return DefaultArgument.isSet() ? *DefaultArgument.get() : NoneLoc;
+ }
/// Retrieve the location of the default argument, if any.
SourceLocation getDefaultArgumentLoc() const;
@@ -1449,7 +1448,8 @@ public:
/// Set the default argument for this template parameter, and
/// whether that default argument was inherited from another
/// declaration.
- void setDefaultArgument(Expr *DefArg) { DefaultArgument.set(DefArg); }
+ void setDefaultArgument(const ASTContext &C,
+ const TemplateArgumentLoc &DefArg);
void setInheritedDefaultArgument(const ASTContext &C,
NonTypeTemplateParmDecl *Parm) {
DefaultArgument.setInherited(C, Parm);
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index f5cefedb07e0..4bbb4380cdd7 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -30,6 +30,7 @@
#include "clang/AST/ExprOpenMP.h"
#include "clang/AST/LambdaCapture.h"
#include "clang/AST/NestedNameSpecifier.h"
+#include "clang/AST/OpenACCClause.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/Stmt.h"
#include "clang/AST/StmtCXX.h"
@@ -510,6 +511,7 @@ private:
bool
TraverseOpenACCAssociatedStmtConstruct(OpenACCAssociatedStmtConstruct *S);
bool VisitOpenACCClauseList(ArrayRef<const OpenACCClause *>);
+ bool VisitOpenACCClause(const OpenACCClause *);
};
template <typename Derived>
@@ -1960,7 +1962,7 @@ DEF_TRAVERSE_DECL(TemplateTypeParmDecl, {
TRY_TO(TraverseType(QualType(D->getTypeForDecl(), 0)));
TRY_TO(TraverseTemplateTypeParamDeclConstraints(D));
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
- TRY_TO(TraverseTypeLoc(D->getDefaultArgumentInfo()->getTypeLoc()));
+ TRY_TO(TraverseTemplateArgumentLoc(D->getDefaultArgument()));
})
DEF_TRAVERSE_DECL(TypedefDecl, {
@@ -2320,7 +2322,7 @@ DEF_TRAVERSE_DECL(NonTypeTemplateParmDecl, {
// A non-type template parameter, e.g. "S" in template<int S> class Foo ...
TRY_TO(TraverseDeclaratorHelper(D));
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
- TRY_TO(TraverseStmt(D->getDefaultArgument()));
+ TRY_TO(TraverseTemplateArgumentLoc(D->getDefaultArgument()));
})
DEF_TRAVERSE_DECL(ParmVarDecl, {
@@ -3968,8 +3970,25 @@ bool RecursiveASTVisitor<Derived>::TraverseOpenACCAssociatedStmtConstruct(
}
template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOpenACCClause(const OpenACCClause *C) {
+ for (const Stmt *Child : C->children())
+ TRY_TO(TraverseStmt(const_cast<Stmt *>(Child)));
+ return true;
+}
+
+template <typename Derived>
bool RecursiveASTVisitor<Derived>::VisitOpenACCClauseList(
- ArrayRef<const OpenACCClause *>) {
+ ArrayRef<const OpenACCClause *> Clauses) {
+
+ for (const auto *C : Clauses)
+ TRY_TO(VisitOpenACCClause(C));
+// if (const auto *WithCond = dyn_cast<OopenACCClauseWithCondition>(C);
+// WithCond && WIthCond->hasConditionExpr()) {
+// TRY_TO(TraverseStmt(WithCond->getConditionExpr());
+// } else if (const auto *
+// }
+// OpenACCClauseWithCondition::getConditionExpr/hasConditionExpr
+//OpenACCClauseWithExprs::children (might be null?)
// TODO OpenACC: When we have Clauses with expressions, we should visit them
// here.
return true;
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index da3834f19ca0..263b632df23c 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2515,6 +2515,7 @@ public:
bool isRecordType() const;
bool isClassType() const;
bool isStructureType() const;
+ bool isStructureTypeWithFlexibleArrayMember() const;
bool isObjCBoxableRecordType() const;
bool isInterfaceType() const;
bool isStructureOrClassType() const;
@@ -2523,6 +2524,7 @@ public:
bool isVectorType() const; // GCC vector type.
bool isExtVectorType() const; // Extended vector type.
bool isExtVectorBoolType() const; // Extended vector type with bool element.
+ bool isSubscriptableVectorType() const;
bool isMatrixType() const; // Matrix type.
bool isConstantMatrixType() const; // Constant matrix type.
bool isDependentAddressSpaceType() const; // value-dependent address space qualifier
@@ -7729,6 +7731,10 @@ inline bool Type::isExtVectorBoolType() const {
return cast<ExtVectorType>(CanonicalType)->getElementType()->isBooleanType();
}
+inline bool Type::isSubscriptableVectorType() const {
+ return isVectorType() || isSveVLSBuiltinType();
+}
+
inline bool Type::isMatrixType() const {
return isa<MatrixType>(CanonicalType);
}
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 7008bea483c8..e59cccccdd36 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1640,10 +1640,11 @@ def Unlikely : StmtAttr {
def : MutualExclusions<[Likely, Unlikely]>;
def CXXAssume : StmtAttr {
- let Spellings = [CXX11<"", "assume", 202207>];
+ let Spellings = [CXX11<"", "assume", 202207>, Clang<"assume">];
let Subjects = SubjectList<[NullStmt], ErrorDiag, "empty statements">;
let Args = [ExprArgument<"Assumption">];
let Documentation = [CXXAssumeDocs];
+ let HasCustomParsing = 1;
}
def NoMerge : DeclOrStmtAttr {
@@ -2256,7 +2257,8 @@ def TypeNullUnspecified : TypeAttr {
def CountedBy : DeclOrTypeAttr {
let Spellings = [Clang<"counted_by">];
let Subjects = SubjectList<[Field], ErrorDiag>;
- let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel">];
+ let Args = [ExprArgument<"Count">, IntArgument<"NestedLevel", 1>];
+ let LateParsed = LateAttrParseExperimentalExt;
let ParseArgumentsAsUnevaluated = 1;
let Documentation = [CountedByDocs];
let LangOpts = [COnly];
@@ -4255,7 +4257,7 @@ def OMPDeclareVariant : InheritableAttr {
}
def OMPAssume : InheritableAttr {
- let Spellings = [Clang<"assume">, CXX11<"omp", "assume">];
+ let Spellings = [CXX11<"omp", "assume">];
let Subjects = SubjectList<[Function, ObjCMethod]>;
let InheritEvenIfAlreadyPresent = 1;
let Documentation = [OMPAssumeDocs];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 54197d588eb4..a313e811c9d2 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2027,9 +2027,6 @@ Different optimisers are likely to react differently to the presence of
this attribute; in some cases, adding ``assume`` may affect performance
negatively. It should be used with parsimony and care.
-Note that `clang::assume` is a different attribute. Always write ``assume``
-without a namespace if you intend to use the standard C++ attribute.
-
Example:
.. code-block:: c++
@@ -4740,7 +4737,7 @@ def OMPAssumeDocs : Documentation {
let Category = DocCatFunction;
let Heading = "assume";
let Content = [{
-Clang supports the ``__attribute__((assume("assumption")))`` attribute to
+Clang supports the ``[[omp::assume("assumption")]]`` attribute to
provide additional information to the optimizer. The string-literal, here
"assumption", will be attached to the function declaration such that later
analysis and optimization passes can assume the "assumption" to hold.
@@ -4752,7 +4749,7 @@ A function can have multiple assume attributes and they propagate from prior
declarations to later definitions. Multiple assumptions are aggregated into a
single comma separated string. Thus, one can provide multiple assumptions via
a comma separated string, i.a.,
-``__attribute__((assume("assumption1,assumption2")))``.
+``[[omp::assume("assumption1,assumption2")]]``.
While LLVM plugins might provide more assumption strings, the default LLVM
optimization passes are aware of the following assumptions:
diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
index cf8711c6eaee..5f53c98167df 100644
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -290,7 +290,7 @@ TARGET_HEADER_BUILTIN(_CountLeadingZeros64, "UiULLi", "nh", INTRIN_H, ALL_MS_LAN
TARGET_HEADER_BUILTIN(_CountOneBits, "UiUNi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
TARGET_HEADER_BUILTIN(_CountOneBits64, "UiULLi", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
-TARGET_HEADER_BUILTIN(__prefetch, "vv*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
+TARGET_HEADER_BUILTIN(__prefetch, "vvC*", "nh", INTRIN_H, ALL_MS_LANGUAGES, "")
#undef BUILTIN
#undef LANGBUILTIN
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 3e21a2fe2ac6..efa652eee990 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -240,6 +240,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at
TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts")
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3UiiUi", "t", "gfx940-insts")
//===----------------------------------------------------------------------===//
// Deep learning builtins.
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 8645cff1e867..fd8c1b480d6d 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -193,6 +193,8 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f"
// Half-Precision (fp16)
TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "half-precision")
TARGET_BUILTIN(__builtin_wasm_storef16_f32, "vfh*", "n", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_splat_f16x8, "V8hf", "nc", "half-precision")
+TARGET_BUILTIN(__builtin_wasm_extract_lane_f16x8, "fV8hi", "nc", "half-precision")
// Reference Types builtins
// Some builtins are custom type-checked - see 't' as part of the third argument,
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index eafcc219c109..7074479786b9 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -832,23 +832,11 @@ TARGET_BUILTIN(__builtin_ia32_rsqrt14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx
TARGET_BUILTIN(__builtin_ia32_rsqrt14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_rsqrt14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28sd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28ss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512")
-TARGET_BUILTIN(__builtin_ia32_rsqrt28ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512")
-
TARGET_BUILTIN(__builtin_ia32_rcp14sd_mask, "V2dV2dV2dV2dUc", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_rcp14ss_mask, "V4fV4fV4fV4fUc", "ncV:128:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_rcp14pd512_mask, "V8dV8dV8dUc", "ncV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_rcp14ps512_mask, "V16fV16fV16fUs", "ncV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcp28sd_round_mask, "V2dV2dV2dV2dUcIi", "ncV:128:", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rcp28ss_round_mask, "V4fV4fV4fV4fUcIi", "ncV:128:", "avx512er")
-TARGET_BUILTIN(__builtin_ia32_rcp28pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512")
-TARGET_BUILTIN(__builtin_ia32_rcp28ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512")
-TARGET_BUILTIN(__builtin_ia32_exp2pd_mask, "V8dV8dV8dUcIi", "ncV:512:", "avx512er,evex512")
-TARGET_BUILTIN(__builtin_ia32_exp2ps_mask, "V16fV16fV16fUsIi", "ncV:512:", "avx512er,evex512")
-
TARGET_BUILTIN(__builtin_ia32_cvttps2dq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_cvttps2udq512_mask, "V16iV16fV16iUsIi", "ncV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_cvttpd2dq512_mask, "V8iV8dV8iUcIi", "ncV:512:", "avx512f,evex512")
@@ -960,15 +948,6 @@ TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vv*UsV16iV16iIi", "nV:512:", "avx
TARGET_BUILTIN(__builtin_ia32_scatterdiv8di, "vv*UcV8OiV8OiIi", "nV:512:", "avx512f,evex512")
TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vv*UcV8OiV8iIi", "nV:512:", "avx512f,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherpfdpd, "vUcV8ivC*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherpfdps, "vUsV16ivC*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherpfqpd, "vUcV8OivC*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_gatherpfqps, "vUcV8OivC*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterpfdpd, "vUcV8iv*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterpfdps, "vUsV16iv*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterpfqpd, "vUcV8Oiv*IiIi", "nV:512:", "avx512pf,evex512")
-TARGET_BUILTIN(__builtin_ia32_scatterpfqps, "vUcV8Oiv*IiIi", "nV:512:", "avx512pf,evex512")
-
TARGET_BUILTIN(__builtin_ia32_knotqi, "UcUc", "nc", "avx512dq")
TARGET_BUILTIN(__builtin_ia32_knothi, "UsUs", "nc", "avx512f")
TARGET_BUILTIN(__builtin_ia32_knotsi, "UiUi", "nc", "avx512bw")
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index 0738f43ca555..1e44bc4ad09b 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -361,9 +361,6 @@ def warn_invalid_feature_combination : Warning<
def warn_target_unrecognized_env : Warning<
"mismatch between architecture and environment in target triple '%0'; did you mean '%1'?">,
InGroup<InvalidCommandLineArgument>;
-def warn_knl_knm_isa_support_removed : Warning<
- "KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.">,
- InGroup<DiagGroup<"knl-knm-isa-support-removed">>;
def err_target_unsupported_abi_with_fpu : Error<
"'%0' ABI is not supported with FPU">;
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 9d97a75f696f..773b234cd68f 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -58,7 +58,7 @@ def warn_drv_avr_stdlib_not_linked: Warning<
def err_drv_cuda_bad_gpu_arch : Error<"unsupported CUDA gpu architecture: %0">;
def err_drv_offload_bad_gpu_arch : Error<"unsupported %0 gpu architecture: %1">;
def err_drv_offload_missing_gpu_arch : Error<
- "Must pass in an explicit %0 gpu architecture to '%1'">;
+ "must pass in an explicit %0 gpu architecture to '%1'">;
def err_drv_no_cuda_installation : Error<
"cannot find CUDA installation; provide its path via '--cuda-path', or pass "
"'-nocudainc' to build without CUDA includes">;
@@ -90,8 +90,8 @@ def err_drv_no_hipspv_device_lib : Error<
"'--hip-path' or '--hip-device-lib-path', or pass '-nogpulib' to build "
"without HIP device library">;
def err_drv_hipspv_no_hip_path : Error<
- "'--hip-path' must be specified when offloading to "
- "SPIR-V%select{| unless %1 is given}0.">;
+ "'--hip-path' must be specified when offloading to SPIR-V unless '-nogpuinc' "
+ "is given">;
// TODO: Remove when COV6 is fully supported by ROCm.
def warn_drv_amdgpu_cov6: Warning<
@@ -137,13 +137,13 @@ def warn_drv_unsupported_option_for_flang : Warning<
"the argument '%0' is not supported for option '%1'. Mapping to '%1%2'">,
InGroup<OptionIgnored>;
def warn_drv_unsupported_diag_option_for_flang : Warning<
- "The warning option '-%0' is not supported">,
+ "the warning option '-%0' is not supported">,
InGroup<OptionIgnored>;
def warn_drv_unsupported_option_for_processor : Warning<
"ignoring '%0' option as it is not currently supported for processor '%1'">,
InGroup<OptionIgnored>;
def warn_drv_unsupported_openmp_library : Warning<
- "The library '%0=%1' is not supported, openmp is not be enabled">,
+ "the library '%0=%1' is not supported, OpenMP will not be enabled">,
InGroup<OptionIgnored>;
def err_drv_invalid_thread_model_for_target : Error<
@@ -356,7 +356,7 @@ def err_drv_expecting_fopenmp_with_fopenmp_targets : Error<
"compatible with offloading; e.g., '-fopenmp=libomp' or '-fopenmp=libiomp5'">;
def err_drv_failed_to_deduce_target_from_arch : Error<
"failed to deduce triple for target architecture '%0'; specify the triple "
- "using '-fopenmp-targets' and '-Xopenmp-target' instead.">;
+ "using '-fopenmp-targets' and '-Xopenmp-target' instead">;
def err_drv_omp_offload_target_missingbcruntime : Error<
"no library '%0' found in the default clang lib directory or in LIBRARY_PATH"
"; use '--libomptarget-%1-bc-path' to specify %1 bitcode library">;
@@ -515,14 +515,6 @@ def err_analyzer_checker_incompatible_analyzer_option : Error<
def err_analyzer_not_built_with_z3 : Error<
"analyzer constraint manager 'z3' is only available if LLVM was built with "
"-DLLVM_ENABLE_Z3_SOLVER=ON">;
-def warn_analyzer_deprecated_option : Warning<
- "analyzer option '%0' is deprecated. This flag will be removed in %1, and "
- "passing this option will be an error.">,
- InGroup<DeprecatedStaticAnalyzerFlag>;
-def warn_analyzer_deprecated_option_with_alternative : Warning<
- "analyzer option '%0' is deprecated. This flag will be removed in %1, and "
- "passing this option will be an error. Use '%2' instead.">,
- InGroup<DeprecatedStaticAnalyzerFlag>;
def warn_drv_needs_hvx : Warning<
"%0 requires HVX, use -mhvx/-mhvx= to enable it">,
@@ -555,10 +547,12 @@ def err_drv_extract_api_wrong_kind : Error<
"in api extraction; use '-x %2' to override">;
def err_drv_missing_symbol_graph_dir: Error<
- "Must provide a symbol graph output directory using --symbol-graph-dir=<directory>">;
+ "must provide a symbol graph output directory using "
+ "'--symbol-graph-dir=<directory>'">;
def err_drv_unexpected_symbol_graph_output : Error<
- "Unexpected output symbol graph '%1'; please provide --symbol-graph-dir=<directory> instead">;
+ "unexpected output symbol graph '%1'; please provide "
+ "'--symbol-graph-dir=<directory>' instead">;
def warn_slash_u_filename : Warning<"'/U%0' treated as the '/U' option">,
InGroup<DiagGroup<"slash-u-filename">>;
@@ -599,9 +593,6 @@ def warn_drv_unsupported_gpopt : Warning<
"ignoring '-mgpopt' option as it cannot be used with %select{|the implicit"
" usage of }0-mabicalls">,
InGroup<UnsupportedGPOpt>;
-def warn_drv_unsupported_tocdata: Warning<
- "ignoring '-mtocdata' as it is only supported for -mcmodel=small">,
- InGroup<OptionIgnored>;
def warn_drv_unsupported_sdata : Warning<
"ignoring '-msmall-data-limit=' with -mcmodel=large for -fpic or RV64">,
InGroup<OptionIgnored>;
@@ -770,19 +761,19 @@ def err_drv_hlsl_16bit_types_unsupported: Error<
"'%0' option requires target HLSL Version >= 2018%select{| and shader model >= 6.2}1, but HLSL Version is '%2'%select{| and shader model is '%3'}1">;
def err_drv_hlsl_bad_shader_unsupported : Error<
"%select{shader model|Vulkan environment|shader stage}0 '%1' in target '%2' is invalid for HLSL code generation">;
-def warn_drv_dxc_missing_dxv : Warning<"dxv not found. "
- "Resulting DXIL will not be validated or signed for use in release environments.">,
- InGroup<DXILValidation>;
+def warn_drv_dxc_missing_dxv : Warning<
+ "dxv not found; resulting DXIL will not be validated or signed for use in "
+ "release environment">, InGroup<DXILValidation>;
def err_drv_invalid_range_dxil_validator_version : Error<
- "invalid validator version : %0\n"
- "Validator version must be less than or equal to current internal version.">;
+ "invalid validator version : %0; validator version must be less than or "
+ "equal to current internal version">;
def err_drv_invalid_format_dxil_validator_version : Error<
- "invalid validator version : %0\n"
- "Format of validator version is \"<major>.<minor>\" (ex:\"1.4\").">;
+ "invalid validator version : %0; format of validator version is "
+ "\"<major>.<minor>\" (ex:\"1.4\")">;
def err_drv_invalid_empty_dxil_validator_version : Error<
- "invalid validator version : %0\n"
- "If validator major version is 0, minor version must also be 0.">;
+ "invalid validator version : %0; if validator major version is 0, minor "
+ "version must also be 0">;
def warn_drv_sarif_format_unstable : Warning<
"diagnostic formatting in SARIF mode is currently unstable">,
@@ -796,12 +787,10 @@ def warn_drv_loongarch_conflicting_implied_val : Warning<
InGroup<OptionIgnored>;
def err_drv_loongarch_invalid_mfpu_EQ : Error<
"invalid argument '%0' to -mfpu=; must be one of: 64, 32, none, 0 (alias for none)">;
-def err_drv_loongarch_wrong_fpu_width_for_lsx : Error<
- "wrong fpu width; LSX depends on 64-bit FPU.">;
-def err_drv_loongarch_wrong_fpu_width_for_lasx : Error<
- "wrong fpu width; LASX depends on 64-bit FPU.">;
+def err_drv_loongarch_wrong_fpu_width : Error<
+ "wrong fpu width; %select{LSX|LASX}0 depends on 64-bit FPU">;
def err_drv_loongarch_invalid_simd_option_combination : Error<
- "invalid option combination; LASX depends on LSX.">;
+ "invalid option combination; LASX depends on LSX">;
def err_drv_expand_response_file : Error<
"failed to expand response file: %0">;
@@ -813,9 +802,9 @@ def note_drv_available_multilibs : Note<
"available multilibs are:%0">;
def warn_android_unversioned_fallback : Warning<
- "Using unversioned Android target directory %0 for target %1. Unversioned"
- " directories will not be used in Clang 19. Provide a versioned directory"
- " for the target version or lower instead.">,
+ "using unversioned Android target directory %0 for target %1; unversioned "
+ "directories will not be used in Clang 19 -- provide a versioned directory "
+ "for the target version or lower instead">,
InGroup<DiagGroup<"android-unversioned-fallback">>;
def err_drv_triple_version_invalid : Error<
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index e456ec2cac46..85c32e55bdab 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -71,14 +71,14 @@ def remark_fe_backend_optimization_remark_analysis : Remark<"%0">, BackendInfo,
InGroup<BackendOptimizationRemarkAnalysis>;
def remark_fe_backend_optimization_remark_analysis_fpcommute : Remark<"%0; "
"allow reordering by specifying '#pragma clang loop vectorize(enable)' "
- "before the loop or by providing the compiler option '-ffast-math'.">,
+ "before the loop or by providing the compiler option '-ffast-math'">,
BackendInfo, InGroup<BackendOptimizationRemarkAnalysis>;
def remark_fe_backend_optimization_remark_analysis_aliasing : Remark<"%0; "
"allow reordering by specifying '#pragma clang loop vectorize(enable)' "
- "before the loop. If the arrays will always be independent specify "
+ "before the loop; if the arrays will always be independent, specify "
"'#pragma clang loop vectorize(assume_safety)' before the loop or provide "
- "the '__restrict__' qualifier with the independent array arguments. "
- "Erroneous results will occur if these options are incorrectly applied!">,
+ "the '__restrict__' qualifier with the independent array arguments -- "
+ "erroneous results will occur if these options are incorrectly applied">,
BackendInfo, InGroup<BackendOptimizationRemarkAnalysis>;
def warn_fe_backend_optimization_failure : Warning<"%0">, BackendInfo,
@@ -152,8 +152,8 @@ def warn_fe_serialized_diag_merge_failure : Warning<
def warn_fe_serialized_diag_failure : Warning<
"unable to open file %0 for serializing diagnostics (%1)">,
InGroup<SerializedDiagnostics>;
-def warn_fe_serialized_diag_failure_during_finalisation : Warning<
- "Received warning after diagnostic serialization teardown was underway: %0">,
+def warn_fe_serialized_diag_failure_during_finalization : Warning<
+ "received warning after diagnostic serialization teardown was underway: %0">,
InGroup<SerializedDiagnostics>;
def err_verify_missing_line : Error<
@@ -337,7 +337,7 @@ def warn_atomic_op_oversized : Warning<
InGroup<AtomicAlignment>;
def warn_sync_op_misaligned : Warning<
- "__sync builtin operation MUST have natural alignment (consider using __atomic).">,
+ "__sync builtin operation must have natural alignment (consider using __atomic)">,
InGroup<SyncAlignment>;
def warn_alias_with_section : Warning<
@@ -359,17 +359,16 @@ def warn_profile_data_unprofiled : Warning<
"no profile data available for file \"%0\"">,
InGroup<ProfileInstrUnprofiled>;
def warn_profile_data_misexpect : Warning<
- "Potential performance regression from use of __builtin_expect(): "
- "Annotation was correct on %0 of profiled executions.">,
- BackendInfo,
- InGroup<MisExpect>;
+ "potential performance regression from use of __builtin_expect(): "
+ "annotation was correct on %0 of profiled executions">,
+ BackendInfo, InGroup<MisExpect>;
} // end of instrumentation issue category
def err_extract_api_ignores_file_not_found :
Error<"file '%0' specified by '--extract-api-ignores=' not found">, DefaultFatal;
def warn_missing_symbol_graph_dir : Warning<
- "Missing symbol graph output directory, defaulting to working directory">,
+ "missing symbol graph output directory, defaulting to working directory">,
InGroup<ExtractAPIMisuse>;
def err_ast_action_on_llvm_ir : Error<
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 4cb4f3d999f7..6b595a356793 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -15,8 +15,6 @@ def Implicit : DiagGroup<"implicit", [
ImplicitInt
]>;
-def DeprecatedStaticAnalyzerFlag : DiagGroup<"deprecated-static-analyzer-flag">;
-
// Empty DiagGroups are recognized by clang but ignored.
def ODR : DiagGroup<"odr">;
def : DiagGroup<"abi">;
@@ -1447,6 +1445,10 @@ def FunctionMultiVersioning
def NoDeref : DiagGroup<"noderef">;
+// -fbounds-safety and bounds annotation related warnings
+def BoundsSafetyCountedByEltTyUnknownSize :
+ DiagGroup<"bounds-safety-counted-by-elt-type-unknown-size">;
+
// A group for cross translation unit static analysis related warnings.
def CrossTU : DiagGroup<"ctu">;
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
index 944b2a38b6e9..cdf27247602f 100644
--- a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
+++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
@@ -59,8 +59,8 @@ def err_platform_mismatch : Error<"platform does not match: '%0' (provided) vs '
def err_install_name_mismatch : Error<"install_name does not match: '%0' (provided) vs '%1' (found)">;
def err_current_version_mismatch : Error<"current_version does not match: '%0' (provided) vs '%1' (found)">;
def err_compatibility_version_mismatch : Error<"compatibility_version does not match: '%0' (provided) vs '%1' (found)">;
-def err_appextension_safe_mismatch : Error<"ApplicationExtensionSafe flag does not match: '%0' (provided) vs '%1' (found)">;
-def err_shared_cache_eligiblity_mismatch : Error<"NotForDyldSharedCache flag does not match: '%0' (provided) vs '%1' (found)">;
+def err_appextension_safe_mismatch : Error<"the ApplicationExtensionSafe flag does not match: '%0' (provided) vs '%1' (found)">;
+def err_shared_cache_eligiblity_mismatch : Error<"the NotForDyldSharedCache flag does not match: '%0' (provided) vs '%1' (found)">;
def err_no_twolevel_namespace : Error<"flat namespace libraries are not supported">;
def err_parent_umbrella_missing: Error<"parent umbrella missing from %0: '%1'">;
def err_parent_umbrella_mismatch : Error<"parent umbrella does not match: '%0' (provided) vs '%1' (found)">;
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
index ad6bacfb118d..5a4551a96ca4 100644
--- a/clang/include/clang/Basic/DiagnosticLexKinds.td
+++ b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -991,5 +991,5 @@ def err_pp_unclosed_pragma_unsafe_buffer_usage :
Error<"'#pragma unsafe_buffer_usage' was not ended">;
def err_pp_pragma_unsafe_buffer_usage_syntax :
-Error<"Expected 'begin' or 'end'">;
+Error<"expected 'begin' or 'end'">;
}
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 46656fc66044..f8328be5890d 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1117,7 +1117,7 @@ def err_availability_expected_environment : Error<
// objc_bridge_related attribute
def err_objcbridge_related_expected_related_class : Error<
- "expected a related ObjectiveC class name, e.g., 'NSColor'">;
+ "expected a related Objective-C class name, e.g., 'NSColor'">;
def err_objcbridge_related_selector_name : Error<
"expected a class method selector with single argument, e.g., 'colorWithCGColor:'">;
@@ -1345,8 +1345,8 @@ def note_pragma_attribute_namespace_on_attribute : Note<
"omit the namespace to add attributes to the most-recently"
" pushed attribute group">;
def warn_no_support_for_eval_method_source_on_m32 : Warning<
- "Setting the floating point evaluation method to `source` on a target"
- " without SSE is not supported.">, InGroup<Pragmas>;
+ "setting the floating point evaluation method to `source` on a target "
+ "without SSE is not supported">, InGroup<Pragmas>;
// - #pragma __debug
def warn_pragma_debug_dependent_argument : Warning<
"%select{value|type}0-dependent expression passed as an argument to debug "
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index c7dea1d54d06..270b0a1e0130 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -310,7 +310,7 @@ def err_invalid_vector_long_double_decl_spec : Error<
def err_invalid_vector_complex_decl_spec : Error<
"cannot use '_Complex' with '__vector'">;
def warn_vector_long_decl_spec_combination : Warning<
- "Use of 'long' with '__vector' is deprecated">, InGroup<Deprecated>;
+ "use of 'long' with '__vector' is deprecated">, InGroup<Deprecated>;
def err_redeclaration_different_type : Error<
"redeclaration of %0 with a different type%diff{: $ vs $|}1,2">;
@@ -754,7 +754,7 @@ def note_include_header_or_declare : Note<
def note_previous_builtin_declaration : Note<"%0 is a builtin with type %1">;
def warn_implicit_decl_no_jmp_buf
: Warning<"declaration of built-in function '%0' requires the declaration"
- " of the 'jmp_buf' type, commonly provided in the header <setjmp.h>.">,
+ " of the 'jmp_buf' type, commonly provided in the header <setjmp.h>">,
InGroup<DiagGroup<"incomplete-setjmp-declaration">>;
def warn_implicit_decl_requires_sysheader : Warning<
"declaration of built-in function '%1' requires inclusion of the header <%0>">,
@@ -3197,7 +3197,7 @@ def err_attribute_bad_sve_vector_size : Error<
"'-msve-vector-bits' ('%1')">;
def err_attribute_arm_feature_sve_bits_unsupported : Error<
"%0 is only supported when '-msve-vector-bits=<bits>' is specified with a "
- "value of 128, 256, 512, 1024 or 2048.">;
+ "value of 128, 256, 512, 1024 or 2048">;
def warn_attribute_arm_sm_incompat_builtin : Warning<
"builtin call has undefined behaviour when called from a %0 function">,
InGroup<DiagGroup<"undefined-arm-streaming">>;
@@ -3975,7 +3975,7 @@ def warn_acquired_before : Warning<
"%0 '%1' must be acquired before '%2'">,
InGroup<ThreadSafetyAnalysis>, DefaultIgnore;
def warn_acquired_before_after_cycle : Warning<
- "Cycle in acquired_before/after dependencies, starting with '%0'">,
+ "cycle in acquired_before/after dependencies, starting with '%0'">,
InGroup<ThreadSafetyAnalysis>, DefaultIgnore;
@@ -4526,7 +4526,7 @@ def err_objc_attr_typedef_not_void_pointer : Error<
def err_objc_cf_bridged_not_interface : Error<
"CF object of type %0 is bridged to %1, which is not an Objective-C class">;
def err_objc_ns_bridged_invalid_cfobject : Error<
- "ObjectiveC object of type %0 is bridged to %1, which is not valid CF object">;
+ "Objective-C object of type %0 is bridged to %1, which is not valid CF object">;
def warn_objc_invalid_bridge : Warning<
"%0 bridges to %1, not %2">, InGroup<ObjCBridge>;
def warn_objc_invalid_bridge_to_cf : Warning<
@@ -6544,8 +6544,10 @@ def warn_superclass_variable_sized_type_not_at_end : Warning<
def err_flexible_array_count_not_in_same_struct : Error<
"'counted_by' field %0 isn't within the same struct as the flexible array">;
-def err_counted_by_attr_not_on_flexible_array_member : Error<
- "'counted_by' only applies to C99 flexible array members">;
+def err_counted_by_attr_not_on_ptr_or_flexible_array_member : Error<
+ "'counted_by' only applies to pointers or C99 flexible array members">;
+def err_counted_by_attr_on_array_not_flexible_array_member : Error<
+ "'counted_by' on arrays only applies to C99 flexible array members">;
def err_counted_by_attr_refer_to_itself : Error<
"'counted_by' cannot refer to the flexible array member %0">;
def err_counted_by_must_be_in_structure : Error<
@@ -6560,6 +6562,23 @@ def err_counted_by_attr_refer_to_union : Error<
"'counted_by' argument cannot refer to a union member">;
def note_flexible_array_counted_by_attr_field : Note<
"field %0 declared here">;
+def err_counted_by_attr_pointee_unknown_size : Error<
+ "'counted_by' %select{cannot|should not}3 be applied to %select{"
+ "a pointer with pointee|" // pointer
+ "an array with element}0" // array
+ " of unknown size because %1 is %select{"
+ "an incomplete type|" // CountedByInvalidPointeeTypeKind::INCOMPLETE
+ "a sizeless type|" // CountedByInvalidPointeeTypeKind::SIZELESS
+ "a function type|" // CountedByInvalidPointeeTypeKind::FUNCTION
+ // CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER
+ "a struct type with a flexible array member"
+ "%select{|. This will be an error in a future compiler version}3"
+ ""
+ "}2">;
+
+def warn_counted_by_attr_elt_type_unknown_size :
+ Warning<err_counted_by_attr_pointee_unknown_size.Summary>,
+ InGroup<BoundsSafetyCountedByEltTyUnknownSize>;
let CategoryName = "ARC Semantic Issue" in {
@@ -7525,6 +7544,11 @@ def err_explicit_object_parameter_mutable: Error<
def err_invalid_explicit_object_type_in_lambda: Error<
"invalid explicit object parameter type %0 in lambda with capture; "
"the type must be the same as, or derived from, the lambda">;
+def err_explicit_object_lambda_ambiguous_base : Error<
+ "lambda %0 is inaccessible due to ambiguity:%1">;
+def err_explicit_object_lambda_inaccessible_base : Error<
+ "invalid explicit object parameter type %0 in lambda with capture; "
+ "the type must derive publicly from the lambda">;
def err_ref_qualifier_overload : Error<
"cannot overload a member function %select{without a ref-qualifier|with "
@@ -7997,15 +8021,15 @@ def warn_deprecated_volatile_structured_binding : Warning<
InGroup<DeprecatedVolatile>;
def warn_deprecated_altivec_src_compat : Warning<
- "Current handling of vector bool and vector pixel types in this context are "
- "deprecated. The default behaviour will soon change to that implied by the "
+ "current handling of vector bool and vector pixel types in this context are "
+ "deprecated; the default behaviour will soon change to that implied by the "
"'-altivec-compat=xl' option">,
InGroup<DiagGroup<"deprecated-altivec-src-compat">>;
def warn_deprecated_lax_vec_conv_all : Warning<
- "Implicit conversion between vector types ('%0' and '%1') is deprecated. "
- "In the future, the behavior implied by '-fno-lax-vector-conversions' "
- "will be the default.">,
+ "implicit conversion between vector types ('%0' and '%1') is deprecated; "
+ "in the future, the behavior implied by '-fno-lax-vector-conversions' "
+ "will be the default">,
InGroup<DiagGroup<"deprecate-lax-vec-conv-all">>;
def err_catch_incomplete_ptr : Error<
@@ -8853,7 +8877,7 @@ def err_atomic_exclusive_builtin_pointer_size : Error<
"address argument to load or store exclusive builtin must be a pointer to"
" 1,2,4 or 8 byte type (%0 invalid)">;
def err_atomic_builtin_ext_int_size : Error<
- "Atomic memory operand must have a power-of-two size">;
+ "atomic memory operand must have a power-of-two size">;
def err_atomic_builtin_bit_int_prohibit : Error<
"argument to atomic builtin of type '_BitInt' is not supported">;
def err_atomic_op_needs_atomic : Error<
@@ -8961,8 +8985,8 @@ def err_va_arg_in_device : Error<
def err_alias_not_supported_on_nvptx : Error<"CUDA older than 10.0 does not support .alias">;
def err_cuda_unattributed_constexpr_cannot_overload_device : Error<
"constexpr function %0 without __host__ or __device__ attributes cannot "
- "overload __device__ function with same signature. Add a __host__ "
- "attribute, or build with -fno-cuda-host-device-constexpr.">;
+ "overload __device__ function with the same signature; add a __host__ "
+ "attribute, or build with -fno-cuda-host-device-constexpr">;
def note_cuda_conflicting_device_function_declared_here : Note<
"conflicting __device__ function declared here">;
def err_cuda_device_exceptions : Error<
@@ -8970,9 +8994,9 @@ def err_cuda_device_exceptions : Error<
"%select{__device__|__global__|__host__|__host__ __device__}1 function">;
def err_dynamic_var_init : Error<
"dynamic initialization is not supported for "
- "__device__, __constant__, __shared__, and __managed__ variables.">;
+ "__device__, __constant__, __shared__, and __managed__ variables">;
def err_shared_var_init : Error<
- "initialization is not supported for __shared__ variables.">;
+ "initialization is not supported for __shared__ variables">;
def err_cuda_vla : Error<
"cannot use variable-length arrays in "
"%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
@@ -10056,12 +10080,6 @@ def warn_new_dangling_initializer_list : Warning<
"the allocated initializer list}0 "
"will be destroyed at the end of the full-expression">,
InGroup<DanglingInitializerList>;
-def warn_unsupported_lifetime_extension : Warning<
- "lifetime extension of "
- "%select{temporary|backing array of initializer list}0 created "
- "by aggregate initialization using a default member initializer "
- "is not yet supported; lifetime of %select{temporary|backing array}0 "
- "will end at the end of the full-expression">, InGroup<Dangling>;
// For non-floating point, expressions of the form x == x or x != x
// should result in a warning, since these always evaluate to a constant.
@@ -10237,9 +10255,6 @@ def err_fallthrough_attr_outside_switch : Error<
def err_fallthrough_attr_invalid_placement : Error<
"fallthrough annotation does not directly precede switch label">;
-def err_assume_attr_args : Error<
- "attribute '%0' requires a single expression argument">;
-
def warn_unreachable_default : Warning<
"default label in switch which covers all enumeration values">,
InGroup<CoveredSwitchDefault>, DefaultIgnore;
@@ -10365,12 +10380,12 @@ def err_shufflevector_argument_too_large : Error<
"index for __builtin_shufflevector must be less than the total number "
"of vector elements">;
def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
- "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 not permitted in a constexpr context.">;
+ "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 is not permitted in a constexpr context">;
def err_convertvector_non_vector : Error<
"first argument to __builtin_convertvector must be a vector">;
def err_convertvector_constexpr_unsupported_vector_cast : Error<
- "unsupported vector cast from %0 to %1 in a constant expression.">;
+ "unsupported vector cast from %0 to %1 in a constant expression">;
def err_builtin_non_vector_type : Error<
"%0 argument to %1 must be of vector type">;
def err_convertvector_incompatible_vector : Error<
@@ -10698,7 +10713,7 @@ def err_kernel_arg_address_space : Error<
"pointer arguments to kernel functions must reside in '__global', "
"'__constant' or '__local' address space">;
def err_opencl_ext_vector_component_invalid_length : Error<
- "vector component access has invalid length %0. Supported: 1,2,3,4,8,16.">;
+ "vector component access has invalid length %0; supported lengths are: 1,2,3,4,8,16">;
def err_opencl_function_variable : Error<
"%select{non-kernel function|function scope}0 variable cannot be declared in %1 address space">;
def err_opencl_addrspace_scope : Error<
@@ -11146,12 +11161,12 @@ def err_omp_atomic_compare : Error<
"the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}',"
" '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}',"
" 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type,"
- " and 'ordop' is one of '<' or '>'.">;
+ " and 'ordop' is one of '<' or '>'">;
def err_omp_atomic_compare_capture : Error<
"the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}',"
" '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}',"
" 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x', 'r', and 'v' are lvalue expressions with scalar type, 'expr', 'e', and 'd' are expressions with scalar type,"
- " and 'ordop' is one of '<' or '>'.">;
+ " and 'ordop' is one of '<' or '>'">;
def note_omp_atomic_compare: Note<
"%select{expected compound statement|expected exactly one expression statement|expected assignment statement|expected conditional operator|expect result value to be at false expression|"
"expect binary operator in conditional expression|expect '<', '>' or '==' as order operator|expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'|"
@@ -11317,7 +11332,7 @@ def err_omp_expected_int_param : Error<
def err_omp_at_least_one_motion_clause_required : Error<
"expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'">;
def err_omp_cannot_update_with_internal_linkage : Error<
- "the host cannot update a declare target variable that is not externally visible.">;
+ "the host cannot update a declare target variable that is not externally visible">;
def err_omp_usedeviceptr_not_a_pointer : Error<
"expected pointer or reference to pointer in 'use_device_ptr' clause">;
def err_omp_argument_type_isdeviceptr : Error <
@@ -11338,10 +11353,10 @@ def err_omp_reduction_vla_unsupported : Error<
def err_omp_linear_distribute_var_non_loop_iteration : Error<
"only loop iteration variables are allowed in 'linear' clause in distribute directives">;
def warn_omp_non_trivial_type_mapped : Warning<
- "Type %0 is not trivially copyable and not guaranteed to be mapped correctly">,
+ "type %0 is not trivially copyable and not guaranteed to be mapped correctly">,
InGroup<OpenMPMapping>;
def err_omp_requires_clause_redeclaration : Error <
- "Only one %0 clause can appear on a requires directive in a single translation unit">;
+ "only one %0 clause can appear on a requires directive in a single translation unit">;
def note_omp_requires_previous_clause : Note <
"%0 clause previously used here">;
def err_omp_directive_before_requires : Error <
@@ -11349,7 +11364,7 @@ def err_omp_directive_before_requires : Error <
def note_omp_requires_encountered_directive : Note <
"'%0' previously encountered here">;
def err_omp_device_ancestor_without_requires_reverse_offload : Error <
- "Device clause with ancestor device-modifier used without specifying 'requires reverse_offload'">;
+ "device clause with ancestor device-modifier used without specifying 'requires reverse_offload'">;
def err_omp_invalid_scope : Error <
"'#pragma omp %0' directive must appear only in file scope">;
def note_omp_invalid_length_on_this_ptr_mapping : Note <
@@ -11761,7 +11776,7 @@ def note_await_ready_no_bool_conversion : Note<
"return type of 'await_ready' is required to be contextually convertible to 'bool'"
>;
def warn_coroutine_handle_address_invalid_return_type : Warning <
- "return type of 'coroutine_handle<>::address should be 'void*' (have %0) in order to get capability with existing async C API.">,
+ "return type of 'coroutine_handle<>::address should be 'void*' (have %0) in order to get capability with existing async C API">,
InGroup<Coroutine>;
def err_coroutine_promise_final_suspend_requires_nothrow : Error<
"the expression 'co_await __promise.final_suspend()' is required to be non-throwing"
@@ -11789,7 +11804,7 @@ def err_conflicting_aligned_options : Error <
"conflicting option '-fcoro-aligned-allocation' and '-fno-aligned-allocation'"
>;
def err_coro_invalid_addr_of_label : Error<
- "the GNU address of label extension is not allowed in coroutines."
+ "the GNU address of label extension is not allowed in coroutines"
>;
def err_coroutine_return_type : Error<
"function returns a type %0 marked with [[clang::coro_return_type]] but is neither a coroutine nor a coroutine wrapper; "
@@ -12389,4 +12404,8 @@ def err_acc_reduction_composite_type
def err_acc_reduction_composite_member_type :Error<
"OpenACC 'reduction' composite variable must not have non-scalar field">;
def note_acc_reduction_composite_member_loc : Note<"invalid field is here">;
+
+// AMDGCN builtins diagnostics
+def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">;
+def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be 1, 2, or 4">;
} // end of sema component.
diff --git a/clang/include/clang/Basic/FileManager.h b/clang/include/clang/Basic/FileManager.h
index 8b4206e52cd4..e1f33d57a898 100644
--- a/clang/include/clang/Basic/FileManager.h
+++ b/clang/include/clang/Basic/FileManager.h
@@ -299,6 +299,8 @@ private:
getBufferForFileImpl(StringRef Filename, int64_t FileSize, bool isVolatile,
bool RequiresNullTerminator) const;
+ DirectoryEntry *&getRealDirEntry(const llvm::vfs::Status &Status);
+
public:
/// Get the 'stat' information for the given \p Path.
///
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 09eb92d6f10d..4061451b2150 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -300,6 +300,7 @@ LANGOPT(HIPStdParInterposeAlloc, 1, 0, "Replace allocations / deallocations with
LANGOPT(OpenACC , 1, 0, "OpenACC Enabled")
+LANGOPT(MSVCEnableStdcMacro , 1, 0, "Define __STDC__ with '-fms-compatibility'")
LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
LANGOPT(AlignedAllocationUnavailable, 1, 0, "aligned allocation functions are unavailable")
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index a9ea71cd0777..03570f94de66 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2186,9 +2186,6 @@ let TargetGuard = "sme2" in {
def SVSQRSHRUN_X4 : SInst<"svqrshrun[_n]_{0}[_{d}_x4]", "b4i", "il", MergeNone, "aarch64_sve_sqrshrun_x4", [IsStreaming], [ImmCheck<1, ImmCheckShiftRight, 0>]>;
- def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], []>;
- def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], []>;
-
// SQDMULH
def SVSQDMULH_SINGLE_X2 : SInst<"svqdmulh[_single_{d}_x2]", "22d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx2", [IsStreaming], []>;
def SVSQDMULH_SINGLE_X4 : SInst<"svqdmulh[_single_{d}_x4]", "44d", "csil", MergeNone, "aarch64_sve_sqdmulh_single_vgx4", [IsStreaming], []>;
@@ -2197,6 +2194,9 @@ let TargetGuard = "sme2" in {
}
let TargetGuard = "sve2p1|sme2" in {
+ def REINTERPRET_SVBOOL_TO_SVCOUNT : Inst<"svreinterpret[_c]", "}P", "Pc", MergeNone, "", [IsStreamingCompatible], []>;
+ def REINTERPRET_SVCOUNT_TO_SVBOOL : Inst<"svreinterpret[_b]", "P}", "Pc", MergeNone, "", [IsStreamingCompatible], []>;
+
// SQRSHRN / UQRSHRN
def SVQRSHRN_X2 : SInst<"svqrshrn[_n]_{0}[_{d}_x2]", "h2i", "i", MergeNone, "aarch64_sve_sqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>;
def SVUQRSHRN_X2 : SInst<"svqrshrn[_n]_{0}[_{d}_x2]", "e2i", "Ui", MergeNone, "aarch64_sve_uqrshrn_x2", [IsStreamingCompatible], [ImmCheck<1, ImmCheck1_16>]>;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 7bb781667e92..de2f245fb29f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -603,6 +603,7 @@ class MarshallingInfoVisibility<KeyPathAndMacro kpm, code default>
// Key paths that are constant during parsing of options with the same key path prefix.
defvar cplusplus = LangOpts<"CPlusPlus">;
defvar cpp11 = LangOpts<"CPlusPlus11">;
+defvar cpp14 = LangOpts<"CPlusPlus14">;
defvar cpp17 = LangOpts<"CPlusPlus17">;
defvar cpp20 = LangOpts<"CPlusPlus20">;
defvar c99 = LangOpts<"C99">;
@@ -2980,6 +2981,10 @@ def fms_compatibility : Flag<["-"], "fms-compatibility">, Group<f_Group>,
Visibility<[ClangOption, CC1Option, CLOption]>,
HelpText<"Enable full Microsoft Visual C++ compatibility">,
MarshallingInfoFlag<LangOpts<"MSVCCompat">>;
+def fms_define_stdc : Flag<["-"], "fms-define-stdc">, Group<f_Group>,
+ Visibility<[ClangOption, CC1Option, CLOption]>,
+ HelpText<"Define '__STDC__' to '1' in MSVC Compatibility mode">,
+ MarshallingInfoFlag<LangOpts<"MSVCEnableStdcMacro">>;
def fms_extensions : Flag<["-"], "fms-extensions">, Group<f_Group>,
Visibility<[ClangOption, CC1Option, CLOption]>,
HelpText<"Accept some non-standard constructs supported by the Microsoft compiler">,
@@ -3388,10 +3393,9 @@ defm relaxed_template_template_args : BoolFOption<"relaxed-template-template-arg
NegFlag<SetFalse, [], [CC1Option], "Disable">,
BothFlags<[], [ClangOption], " C++17 relaxed template template argument matching">>;
defm sized_deallocation : BoolFOption<"sized-deallocation",
- LangOpts<"SizedDeallocation">, DefaultFalse,
- PosFlag<SetTrue, [], [ClangOption, CC1Option],
- "Enable C++14 sized global deallocation functions">,
- NegFlag<SetFalse>>;
+ LangOpts<"SizedDeallocation">, Default<cpp14.KeyPath>,
+ PosFlag<SetTrue, [], [], "Enable C++14 sized global deallocation functions">,
+ NegFlag<SetFalse>, BothFlags<[], [ClangOption, CC1Option]>>;
defm aligned_allocation : BoolFOption<"aligned-allocation",
LangOpts<"AlignedAllocation">, Default<cpp17.KeyPath>,
PosFlag<SetTrue, [], [ClangOption], "Enable C++17 aligned allocation functions">,
@@ -6111,14 +6115,10 @@ def mavx512cd : Flag<["-"], "mavx512cd">, Group<m_x86_Features_Group>;
def mno_avx512cd : Flag<["-"], "mno-avx512cd">, Group<m_x86_Features_Group>;
def mavx512dq : Flag<["-"], "mavx512dq">, Group<m_x86_Features_Group>;
def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group<m_x86_Features_Group>;
-def mavx512er : Flag<["-"], "mavx512er">, Group<m_x86_Features_Group>;
-def mno_avx512er : Flag<["-"], "mno-avx512er">, Group<m_x86_Features_Group>;
def mavx512fp16 : Flag<["-"], "mavx512fp16">, Group<m_x86_Features_Group>;
def mno_avx512fp16 : Flag<["-"], "mno-avx512fp16">, Group<m_x86_Features_Group>;
def mavx512ifma : Flag<["-"], "mavx512ifma">, Group<m_x86_Features_Group>;
def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group<m_x86_Features_Group>;
-def mavx512pf : Flag<["-"], "mavx512pf">, Group<m_x86_Features_Group>;
-def mno_avx512pf : Flag<["-"], "mno-avx512pf">, Group<m_x86_Features_Group>;
def mavx512vbmi : Flag<["-"], "mavx512vbmi">, Group<m_x86_Features_Group>;
def mno_avx512vbmi : Flag<["-"], "mno-avx512vbmi">, Group<m_x86_Features_Group>;
def mavx512vbmi2 : Flag<["-"], "mavx512vbmi2">, Group<m_x86_Features_Group>;
@@ -6209,8 +6209,6 @@ def mpopcnt : Flag<["-"], "mpopcnt">, Group<m_x86_Features_Group>;
def mno_popcnt : Flag<["-"], "mno-popcnt">, Group<m_x86_Features_Group>;
def mprefetchi : Flag<["-"], "mprefetchi">, Group<m_x86_Features_Group>;
def mno_prefetchi : Flag<["-"], "mno-prefetchi">, Group<m_x86_Features_Group>;
-def mprefetchwt1 : Flag<["-"], "mprefetchwt1">, Group<m_x86_Features_Group>;
-def mno_prefetchwt1 : Flag<["-"], "mno-prefetchwt1">, Group<m_x86_Features_Group>;
def mprfchw : Flag<["-"], "mprfchw">, Group<m_x86_Features_Group>;
def mno_prfchw : Flag<["-"], "mno-prfchw">, Group<m_x86_Features_Group>;
def mptwrite : Flag<["-"], "mptwrite">, Group<m_x86_Features_Group>;
@@ -8312,6 +8310,9 @@ def _SLASH_vd : CLJoined<"vd">, HelpText<"Control vtordisp placement">,
Alias<vtordisp_mode_EQ>;
def _SLASH_X : CLFlag<"X">,
HelpText<"Do not add %INCLUDE% to include search path">, Alias<nostdlibinc>;
+def _SLASH_Zc___STDC__ : CLFlag<"Zc:__STDC__">,
+ HelpText<"Define __STDC__">,
+ Alias<fms_define_stdc>;
def _SLASH_Zc_sizedDealloc : CLFlag<"Zc:sizedDealloc">,
HelpText<"Enable C++14 sized global deallocation functions">,
Alias<fsized_deallocation>;
diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h
index 3c4ab649e3b4..8493026f5f7a 100644
--- a/clang/include/clang/Parse/Parser.h
+++ b/clang/include/clang/Parse/Parser.h
@@ -1646,8 +1646,12 @@ private:
void ParseLexedAttributes(ParsingClass &Class);
void ParseLexedAttributeList(LateParsedAttrList &LAs, Decl *D,
bool EnterScope, bool OnDefinition);
+ void ParseLexedCAttributeList(LateParsedAttrList &LA, bool EnterScope,
+ ParsedAttributes *OutAttrs = nullptr);
void ParseLexedAttribute(LateParsedAttribute &LA,
bool EnterScope, bool OnDefinition);
+ void ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope,
+ ParsedAttributes *OutAttrs = nullptr);
void ParseLexedMethodDeclarations(ParsingClass &Class);
void ParseLexedMethodDeclaration(LateParsedMethodDeclaration &LM);
void ParseLexedMethodDefs(ParsingClass &Class);
@@ -2534,7 +2538,8 @@ private:
void ParseStructDeclaration(
ParsingDeclSpec &DS,
- llvm::function_ref<void(ParsingFieldDeclarator &)> FieldsCallback);
+ llvm::function_ref<Decl *(ParsingFieldDeclarator &)> FieldsCallback,
+ LateParsedAttrList *LateFieldAttrs = nullptr);
DeclGroupPtrTy ParseTopLevelStmtDecl();
@@ -2814,7 +2819,7 @@ private:
SourceLocation CorrectLocation);
void stripTypeAttributesOffDeclSpec(ParsedAttributes &Attrs, DeclSpec &DS,
- Sema::TagUseKind TUK);
+ TagUseKind TUK);
// FixItLoc = possible correct location for the attributes
void ProhibitAttributes(ParsedAttributes &Attrs,
@@ -2997,7 +3002,8 @@ private:
bool ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs,
IdentifierInfo *AttrName,
SourceLocation AttrNameLoc,
- SourceLocation *EndLoc);
+ SourceLocation *EndLoc,
+ ParsedAttr::Form Form);
IdentifierInfo *TryParseCXX11AttributeIdentifier(
SourceLocation &Loc,
@@ -3112,6 +3118,8 @@ private:
SourceLocation ScopeLoc,
ParsedAttr::Form Form);
+ void DistributeCLateParsedAttrs(Decl *Dcl, LateParsedAttrList *LateAttrs);
+
void ParseBoundsAttribute(IdentifierInfo &AttrName,
SourceLocation AttrNameLoc, ParsedAttributes &Attrs,
IdentifierInfo *ScopeName, SourceLocation ScopeLoc,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 01ddba5eaf01..524737918180 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -175,7 +175,9 @@ class SemaObjC;
class SemaOpenACC;
class SemaOpenMP;
class SemaPseudoObject;
+class SemaRISCV;
class SemaSYCL;
+class SemaX86;
class StandardConversionSequence;
class Stmt;
class StringLiteral;
@@ -447,6 +449,13 @@ enum class CheckedConversionKind {
ForBuiltinOverloadedOp
};
+enum class TagUseKind {
+ Reference, // Reference to a tag: 'struct foo *X;'
+ Declaration, // Fwd decl of a tag: 'struct foo;'
+ Definition, // Definition of a tag: 'struct foo { int X; } Y;'
+ Friend // Friend declaration: 'friend struct foo;'
+};
+
/// Sema - This implements semantic analysis and AST building for C.
/// \nosubgrouping
class Sema final : public SemaBase {
@@ -484,7 +493,6 @@ class Sema final : public SemaBase {
// 29. Constraints and Concepts (SemaConcept.cpp)
// 30. Types (SemaType.cpp)
// 31. FixIt Helpers (SemaFixItUtils.cpp)
- // 32. Name Lookup for RISC-V Vector Intrinsic (SemaRISCVVectorLookup.cpp)
/// \name Semantic Analysis
/// Implementations are in Sema.cpp
@@ -1020,11 +1028,21 @@ public:
return *PseudoObjectPtr;
}
+ SemaRISCV &RISCV() {
+ assert(RISCVPtr);
+ return *RISCVPtr;
+ }
+
SemaSYCL &SYCL() {
assert(SYCLPtr);
return *SYCLPtr;
}
+ SemaX86 &X86() {
+ assert(X86Ptr);
+ return *X86Ptr;
+ }
+
/// Source of additional semantic information.
IntrusiveRefCntPtr<ExternalSemaSource> ExternalSource;
@@ -1062,7 +1080,9 @@ private:
std::unique_ptr<SemaOpenACC> OpenACCPtr;
std::unique_ptr<SemaOpenMP> OpenMPPtr;
std::unique_ptr<SemaPseudoObject> PseudoObjectPtr;
+ std::unique_ptr<SemaRISCV> RISCVPtr;
std::unique_ptr<SemaSYCL> SYCLPtr;
+ std::unique_ptr<SemaX86> X86Ptr;
///@}
@@ -2037,6 +2057,23 @@ public:
void CheckConstrainedAuto(const AutoType *AutoT, SourceLocation Loc);
+ bool BuiltinConstantArg(CallExpr *TheCall, int ArgNum, llvm::APSInt &Result);
+ bool BuiltinConstantArgRange(CallExpr *TheCall, int ArgNum, int Low, int High,
+ bool RangeIsError = true);
+ bool BuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum,
+ unsigned Multiple);
+ bool BuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum);
+ bool BuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum,
+ unsigned ArgBits);
+ bool BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum,
+ unsigned ArgBits);
+
+ bool checkArgCountAtLeast(CallExpr *Call, unsigned MinArgCount);
+ bool checkArgCountAtMost(CallExpr *Call, unsigned MaxArgCount);
+ bool checkArgCountRange(CallExpr *Call, unsigned MinArgCount,
+ unsigned MaxArgCount);
+ bool checkArgCount(CallExpr *Call, unsigned DesiredArgCount);
+
private:
void CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr,
const ArraySubscriptExpr *ASE = nullptr,
@@ -2092,24 +2129,10 @@ private:
CallExpr *TheCall);
bool CheckMipsBuiltinArgument(unsigned BuiltinID, CallExpr *TheCall);
bool CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall,
- ArrayRef<int> ArgNums);
- bool CheckX86BuiltinTileDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums);
- bool CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall,
- ArrayRef<int> ArgNums);
- bool CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
- CallExpr *TheCall);
bool CheckPPCBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
CallExpr *TheCall);
bool CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
- bool CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum);
- bool CheckRISCVBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
- CallExpr *TheCall);
- void checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
- const llvm::StringMap<bool> &FeatureMap);
+
bool CheckLoongArchBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID, CallExpr *TheCall);
bool CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
@@ -2139,16 +2162,6 @@ private:
ExprResult BuiltinNontemporalOverloaded(ExprResult TheCallResult);
ExprResult AtomicOpsOverloaded(ExprResult TheCallResult,
AtomicExpr::AtomicOp Op);
- bool BuiltinConstantArg(CallExpr *TheCall, int ArgNum, llvm::APSInt &Result);
- bool BuiltinConstantArgRange(CallExpr *TheCall, int ArgNum, int Low, int High,
- bool RangeIsError = true);
- bool BuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum,
- unsigned Multiple);
- bool BuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum);
- bool BuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum,
- unsigned ArgBits);
- bool BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum,
- unsigned ArgBits);
bool BuiltinARMSpecialReg(unsigned BuiltinID, CallExpr *TheCall, int ArgNum,
unsigned ExpectedFieldNum, bool AllowName);
bool BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
@@ -3168,13 +3181,6 @@ public:
bool isDefinition, SourceLocation NewTagLoc,
const IdentifierInfo *Name);
- enum TagUseKind {
- TUK_Reference, // Reference to a tag: 'struct foo *X;'
- TUK_Declaration, // Fwd decl of a tag: 'struct foo;'
- TUK_Definition, // Definition of a tag: 'struct foo { int X; } Y;'
- TUK_Friend // Friend declaration: 'friend struct foo;'
- };
-
enum OffsetOfKind {
// Not parsing a type within __builtin_offsetof.
OOK_Outside,
@@ -5106,6 +5112,13 @@ public:
Context == ExpressionEvaluationContext::UnevaluatedList;
}
+ bool isPotentiallyEvaluated() const {
+ return Context == ExpressionEvaluationContext::PotentiallyEvaluated ||
+ Context ==
+ ExpressionEvaluationContext::PotentiallyEvaluatedIfUsed ||
+ Context == ExpressionEvaluationContext::ConstantEvaluated;
+ }
+
bool isConstantEvaluated() const {
return Context == ExpressionEvaluationContext::ConstantEvaluated ||
Context == ExpressionEvaluationContext::ImmediateFunctionContext;
@@ -5140,6 +5153,12 @@ public:
return ExprEvalContexts.back();
};
+ const ExpressionEvaluationContextRecord &parentEvaluationContext() const {
+ assert(ExprEvalContexts.size() >= 2 &&
+ "Must be in an expression evaluation context");
+ return ExprEvalContexts[ExprEvalContexts.size() - 2];
+ };
+
bool isBoundsAttrContext() const {
return ExprEvalContexts.back().ExprContext ==
ExpressionEvaluationContextRecord::ExpressionKind::
@@ -5890,7 +5909,6 @@ public:
SourceLocation Loc, bool IsCompAssign);
bool isValidSveBitcast(QualType srcType, QualType destType);
- bool isValidRVVBitcast(QualType srcType, QualType destType);
bool areMatrixTypesOfTheSameDimension(QualType srcTy, QualType destTy);
@@ -7063,7 +7081,9 @@ public:
StorageClass SC, ArrayRef<ParmVarDecl *> Params,
bool HasExplicitResultType);
- void DiagnoseInvalidExplicitObjectParameterInLambda(CXXMethodDecl *Method);
+ /// Returns true if the explicit object parameter was invalid.
+ bool DiagnoseInvalidExplicitObjectParameterInLambda(CXXMethodDecl *Method,
+ SourceLocation CallLoc);
/// Perform initialization analysis of the init-capture and perform
/// any implicit conversions such as an lvalue-to-rvalue conversion if
@@ -10067,7 +10087,9 @@ public:
bool SubstTemplateArgument(const TemplateArgumentLoc &Input,
const MultiLevelTemplateArgumentList &TemplateArgs,
- TemplateArgumentLoc &Output);
+ TemplateArgumentLoc &Output,
+ SourceLocation Loc = {},
+ const DeclarationName &Entity = {});
bool
SubstTemplateArguments(ArrayRef<TemplateArgumentLoc> Args,
const MultiLevelTemplateArgumentList &TemplateArgs,
@@ -11381,7 +11403,8 @@ public:
QualType BuildMatrixType(QualType T, Expr *NumRows, Expr *NumColumns,
SourceLocation AttrLoc);
- QualType BuildCountAttributedArrayType(QualType WrappedTy, Expr *CountExpr);
+ QualType BuildCountAttributedArrayOrPointerType(QualType WrappedTy,
+ Expr *CountExpr);
QualType BuildAddressSpaceAttr(QualType &T, LangAS ASIdx, Expr *AddrSpace,
SourceLocation AttrLoc);
@@ -11685,27 +11708,6 @@ public:
void ProcessAPINotes(Decl *D);
///@}
- //
- //
- // -------------------------------------------------------------------------
- //
- //
-
- /// \name Name Lookup for RISC-V Vector Intrinsic
- /// Implementations are in SemaRISCVVectorLookup.cpp
- ///@{
-
-public:
- /// Indicate RISC-V vector builtin functions enabled or not.
- bool DeclareRISCVVBuiltins = false;
-
- /// Indicate RISC-V SiFive vector builtin functions enabled or not.
- bool DeclareRISCVSiFiveVectorBuiltins = false;
-
-private:
- std::unique_ptr<sema::RISCVIntrinsicManager> RVIntrinsicManager;
-
- ///@}
};
DeductionFailureInfo
@@ -11727,9 +11729,6 @@ void Sema::PragmaStack<Sema::AlignPackInfo>::Act(SourceLocation PragmaLocation,
PragmaMsStackAction Action,
llvm::StringRef StackSlotLabel,
AlignPackInfo Value);
-
-std::unique_ptr<sema::RISCVIntrinsicManager>
-CreateRISCVIntrinsicManager(Sema &S);
} // end namespace clang
#endif
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 9927459bbc59..51981e1c9a8b 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -1390,9 +1390,7 @@ private:
bool checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body,
- SmallVectorImpl<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>>
- &OriginalInits);
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits);
/// Helper to keep information about the current `omp begin/end declare
/// variant` nesting.
diff --git a/clang/include/clang/Sema/SemaRISCV.h b/clang/include/clang/Sema/SemaRISCV.h
new file mode 100644
index 000000000000..b6dd81f8d4d8
--- /dev/null
+++ b/clang/include/clang/Sema/SemaRISCV.h
@@ -0,0 +1,52 @@
+//===----- SemaRISCV.h ---- RISC-V target-specific routines ---*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares semantic analysis functions specific to RISC-V.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SEMA_SEMARISCV_H
+#define LLVM_CLANG_SEMA_SEMARISCV_H
+
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Sema/RISCVIntrinsicManager.h"
+#include "clang/Sema/SemaBase.h"
+#include "llvm/ADT/StringMap.h"
+#include <memory>
+
+namespace clang {
+class SemaRISCV : public SemaBase {
+public:
+ SemaRISCV(Sema &S);
+
+ bool CheckLMUL(CallExpr *TheCall, unsigned ArgNum);
+ bool CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
+ CallExpr *TheCall);
+ void checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
+ const llvm::StringMap<bool> &FeatureMap);
+
+ bool isValidRVVBitcast(QualType srcType, QualType destType);
+
+ /// Indicate RISC-V vector builtin functions enabled or not.
+ bool DeclareRVVBuiltins = false;
+
+ /// Indicate RISC-V SiFive vector builtin functions enabled or not.
+ bool DeclareSiFiveVectorBuiltins = false;
+
+ std::unique_ptr<sema::RISCVIntrinsicManager> IntrinsicManager;
+};
+
+std::unique_ptr<sema::RISCVIntrinsicManager>
+CreateRISCVIntrinsicManager(Sema &S);
+} // namespace clang
+
+#endif // LLVM_CLANG_SEMA_SEMARISCV_H
diff --git a/clang/include/clang/Sema/SemaX86.h b/clang/include/clang/Sema/SemaX86.h
new file mode 100644
index 000000000000..e322483294ec
--- /dev/null
+++ b/clang/include/clang/Sema/SemaX86.h
@@ -0,0 +1,38 @@
+//===----- SemaX86.h ------- X86 target-specific routines -----*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares semantic analysis functions specific to X86.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SEMA_SEMAX86_H
+#define LLVM_CLANG_SEMA_SEMAX86_H
+
+#include "clang/AST/Expr.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Sema/SemaBase.h"
+
+namespace clang {
+class SemaX86 : public SemaBase {
+public:
+ SemaX86(Sema &S);
+
+ bool CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall);
+ bool CheckBuiltinGatherScatterScale(unsigned BuiltinID, CallExpr *TheCall);
+ bool CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall);
+ bool CheckBuiltinTileArgumentsRange(CallExpr *TheCall, ArrayRef<int> ArgNums);
+ bool CheckBuiltinTileDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums);
+ bool CheckBuiltinTileRangeAndDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums);
+ bool CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
+ CallExpr *TheCall);
+};
+} // namespace clang
+
+#endif // LLVM_CLANG_SEMA_SEMAX86_H
diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 64414e3d37f7..40f443047bd4 100644
--- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -1011,6 +1011,11 @@ def FloatLoopCounter : Checker<"FloatLoopCounter">,
Dependencies<[SecuritySyntaxChecker]>,
Documentation<HasDocumentation>;
+def SetgidSetuidOrderChecker : Checker<"SetgidSetuidOrder">,
+ HelpText<"Warn on possible reversed order of 'setgid(getgid()))' and "
+ "'setuid(getuid())' (CERT: POS36-C)">,
+ Documentation<HasDocumentation>;
+
} // end "security"
let ParentPackage = ENV in {
@@ -1030,15 +1035,6 @@ let ParentPackage = ENV in {
} // end "security.cert.env"
-let ParentPackage = POSAlpha in {
-
- def PutenvWithAuto : Checker<"34c">,
- HelpText<"Finds calls to the 'putenv' function which pass a pointer to "
- "an automatic variable as the argument.">,
- Documentation<HasDocumentation>;
-
-} // end "alpha.cert.pos"
-
let ParentPackage = SecurityAlpha in {
def ArrayBoundChecker : Checker<"ArrayBound">,
@@ -1049,10 +1045,6 @@ def ArrayBoundCheckerV2 : Checker<"ArrayBoundV2">,
HelpText<"Warn about buffer overflows (newer checker)">,
Documentation<HasDocumentation>;
-def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
- HelpText<"Check for an out-of-bound pointer being returned to callers">,
- Documentation<HasDocumentation>;
-
def MallocOverflowSecurityChecker : Checker<"MallocOverflow">,
HelpText<"Check for overflows in the arguments to malloc()">,
Documentation<HasDocumentation>;
@@ -1073,6 +1065,15 @@ def MmapWriteExecChecker : Checker<"MmapWriteExec">,
]>,
Documentation<HasDocumentation>;
+def PutenvStackArray : Checker<"PutenvStackArray">,
+ HelpText<"Finds calls to the function 'putenv' which pass a pointer to "
+ "an automatic (stack-allocated) array as the argument.">,
+ Documentation<HasDocumentation>;
+
+def ReturnPointerRangeChecker : Checker<"ReturnPtrRange">,
+ HelpText<"Check for an out-of-bound pointer being returned to callers">,
+ Documentation<HasDocumentation>;
+
} // end "alpha.security"
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 52eab5feb062..a2398fef623e 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -6494,7 +6494,8 @@ bool ASTContext::isSameDefaultTemplateArgument(const NamedDecl *X,
if (!TTPX->hasDefaultArgument() || !TTPY->hasDefaultArgument())
return false;
- return hasSameType(TTPX->getDefaultArgument(), TTPY->getDefaultArgument());
+ return hasSameType(TTPX->getDefaultArgument().getArgument().getAsType(),
+ TTPY->getDefaultArgument().getArgument().getAsType());
}
if (auto *NTTPX = dyn_cast<NonTypeTemplateParmDecl>(X)) {
@@ -6502,8 +6503,10 @@ bool ASTContext::isSameDefaultTemplateArgument(const NamedDecl *X,
if (!NTTPX->hasDefaultArgument() || !NTTPY->hasDefaultArgument())
return false;
- Expr *DefaultArgumentX = NTTPX->getDefaultArgument()->IgnoreImpCasts();
- Expr *DefaultArgumentY = NTTPY->getDefaultArgument()->IgnoreImpCasts();
+ Expr *DefaultArgumentX =
+ NTTPX->getDefaultArgument().getArgument().getAsExpr()->IgnoreImpCasts();
+ Expr *DefaultArgumentY =
+ NTTPY->getDefaultArgument().getArgument().getAsExpr()->IgnoreImpCasts();
llvm::FoldingSetNodeID XID, YID;
DefaultArgumentX->Profile(XID, *this, /*Canonical=*/true);
DefaultArgumentY->Profile(YID, *this, /*Canonical=*/true);
diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp
index 7b0d5f9cc1a9..0680ff5e3a38 100644
--- a/clang/lib/AST/ASTDiagnostic.cpp
+++ b/clang/lib/AST/ASTDiagnostic.cpp
@@ -1215,46 +1215,19 @@ class TemplateDiff {
bool &NeedAddressOf) {
if (!Iter.isEnd()) {
switch (Iter->getKind()) {
- default:
- llvm_unreachable("unknown ArgumentKind");
- case TemplateArgument::Integral:
- Value = Iter->getAsIntegral();
- HasInt = true;
- IntType = Iter->getIntegralType();
- return;
- case TemplateArgument::Declaration: {
- VD = Iter->getAsDecl();
- QualType ArgType = Iter->getParamTypeForDecl();
- QualType VDType = VD->getType();
- if (ArgType->isPointerType() &&
- Context.hasSameType(ArgType->getPointeeType(), VDType))
- NeedAddressOf = true;
- return;
- }
- case TemplateArgument::NullPtr:
- IsNullPtr = true;
- return;
- case TemplateArgument::Expression:
- E = Iter->getAsExpr();
- }
- } else if (!Default->isParameterPack()) {
- E = Default->getDefaultArgument();
- }
-
- if (!Iter.hasDesugaredTA()) return;
-
- const TemplateArgument& TA = Iter.getDesugaredTA();
- switch (TA.getKind()) {
- default:
- llvm_unreachable("unknown ArgumentKind");
+ case TemplateArgument::StructuralValue:
+ // FIXME: Diffing of structural values is not implemented.
+ // There is no possible fallback in this case, this will show up
+ // as '(no argument)'.
+ return;
case TemplateArgument::Integral:
- Value = TA.getAsIntegral();
+ Value = Iter->getAsIntegral();
HasInt = true;
- IntType = TA.getIntegralType();
+ IntType = Iter->getIntegralType();
return;
case TemplateArgument::Declaration: {
- VD = TA.getAsDecl();
- QualType ArgType = TA.getParamTypeForDecl();
+ VD = Iter->getAsDecl();
+ QualType ArgType = Iter->getParamTypeForDecl();
QualType VDType = VD->getType();
if (ArgType->isPointerType() &&
Context.hasSameType(ArgType->getPointeeType(), VDType))
@@ -1265,13 +1238,62 @@ class TemplateDiff {
IsNullPtr = true;
return;
case TemplateArgument::Expression:
- // TODO: Sometimes, the desugared template argument Expr differs from
- // the sugared template argument Expr. It may be useful in the future
- // but for now, it is just discarded.
- if (!E)
- E = TA.getAsExpr();
- return;
+ E = Iter->getAsExpr();
+ break;
+ case TemplateArgument::Null:
+ case TemplateArgument::Type:
+ case TemplateArgument::Template:
+ case TemplateArgument::TemplateExpansion:
+ llvm_unreachable("TemplateArgument kind is not expected for NTTP");
+ case TemplateArgument::Pack:
+ llvm_unreachable("TemplateArgument kind should be handled elsewhere");
+ }
+ } else if (!Default->isParameterPack()) {
+ E = Default->getDefaultArgument().getArgument().getAsExpr();
}
+
+ if (!Iter.hasDesugaredTA())
+ return;
+
+ const TemplateArgument &TA = Iter.getDesugaredTA();
+ switch (TA.getKind()) {
+ case TemplateArgument::StructuralValue:
+ // FIXME: Diffing of structural values is not implemented.
+ // Just fall back to the expression.
+ return;
+ case TemplateArgument::Integral:
+ Value = TA.getAsIntegral();
+ HasInt = true;
+ IntType = TA.getIntegralType();
+ return;
+ case TemplateArgument::Declaration: {
+ VD = TA.getAsDecl();
+ QualType ArgType = TA.getParamTypeForDecl();
+ QualType VDType = VD->getType();
+ if (ArgType->isPointerType() &&
+ Context.hasSameType(ArgType->getPointeeType(), VDType))
+ NeedAddressOf = true;
+ return;
+ }
+ case TemplateArgument::NullPtr:
+ IsNullPtr = true;
+ return;
+ case TemplateArgument::Expression:
+ // TODO: Sometimes, the desugared template argument Expr differs from
+ // the sugared template argument Expr. It may be useful in the future
+ // but for now, it is just discarded.
+ if (!E)
+ E = TA.getAsExpr();
+ return;
+ case TemplateArgument::Null:
+ case TemplateArgument::Type:
+ case TemplateArgument::Template:
+ case TemplateArgument::TemplateExpansion:
+ llvm_unreachable("TemplateArgument kind is not expected for NTTP");
+ case TemplateArgument::Pack:
+ llvm_unreachable("TemplateArgument kind should be handled elsewhere");
+ }
+ llvm_unreachable("Unexpected TemplateArgument kind");
}
/// DiffNonTypes - Handles any template parameters not handled by DiffTypes
@@ -1914,6 +1936,11 @@ class TemplateDiff {
return;
}
+ if (E) {
+ PrintExpr(E);
+ return;
+ }
+
OS << "(no argument)";
}
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 9ff8e1ea78d8..cab5ee604795 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -5917,11 +5917,11 @@ ASTNodeImporter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
}
if (D->hasDefaultArgument()) {
- Expected<TypeSourceInfo *> ToDefaultArgOrErr =
- import(D->getDefaultArgumentInfo());
+ Expected<TemplateArgumentLoc> ToDefaultArgOrErr =
+ import(D->getDefaultArgument());
if (!ToDefaultArgOrErr)
return ToDefaultArgOrErr.takeError();
- ToD->setDefaultArgument(*ToDefaultArgOrErr);
+ ToD->setDefaultArgument(ToD->getASTContext(), *ToDefaultArgOrErr);
}
return ToD;
@@ -5949,10 +5949,11 @@ ASTNodeImporter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
return ToD;
if (D->hasDefaultArgument()) {
- ExpectedExpr ToDefaultArgOrErr = import(D->getDefaultArgument());
+ Expected<TemplateArgumentLoc> ToDefaultArgOrErr =
+ import(D->getDefaultArgument());
if (!ToDefaultArgOrErr)
return ToDefaultArgOrErr.takeError();
- ToD->setDefaultArgument(*ToDefaultArgOrErr);
+ ToD->setDefaultArgument(Importer.getToContext(), *ToDefaultArgOrErr);
}
return ToD;
diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp
index c5868256b440..0cf4e64f83b8 100644
--- a/clang/lib/AST/DeclPrinter.cpp
+++ b/clang/lib/AST/DeclPrinter.cpp
@@ -1883,7 +1883,8 @@ void DeclPrinter::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *TTP) {
if (TTP->hasDefaultArgument()) {
Out << " = ";
- Out << TTP->getDefaultArgument().getAsString(Policy);
+ TTP->getDefaultArgument().getArgument().print(Policy, Out,
+ /*IncludeType=*/false);
}
}
@@ -1897,7 +1898,7 @@ void DeclPrinter::VisitNonTypeTemplateParmDecl(
if (NTTP->hasDefaultArgument()) {
Out << " = ";
- NTTP->getDefaultArgument()->printPretty(Out, nullptr, Policy, Indentation,
- "\n", &Context);
+ NTTP->getDefaultArgument().getArgument().print(Policy, Out,
+ /*IncludeType=*/false);
}
}
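For illustration, a minimal sketch (assumed, not taken from this patch) of declarations whose defaults the printer now emits via TemplateArgument::print without their types:

template <class> struct DefaultTT {};

// Each default below is now printed as a template argument (IncludeType=false),
// so the printed form stays e.g. "= int", "= 3", "= DefaultTT".
template <typename T = int, int N = 3, template <class> class TT = DefaultTT>
struct Widget;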
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index 26765a5da1dc..95ffd4784641 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -669,23 +669,30 @@ TemplateTypeParmDecl::CreateDeserialized(const ASTContext &C, GlobalDeclID ID,
}
SourceLocation TemplateTypeParmDecl::getDefaultArgumentLoc() const {
- return hasDefaultArgument()
- ? getDefaultArgumentInfo()->getTypeLoc().getBeginLoc()
- : SourceLocation();
+ return hasDefaultArgument() ? getDefaultArgument().getLocation()
+ : SourceLocation();
}
SourceRange TemplateTypeParmDecl::getSourceRange() const {
if (hasDefaultArgument() && !defaultArgumentWasInherited())
return SourceRange(getBeginLoc(),
- getDefaultArgumentInfo()->getTypeLoc().getEndLoc());
+ getDefaultArgument().getSourceRange().getEnd());
// TypeDecl::getSourceRange returns a range containing name location, which is
// wrong for unnamed template parameters. e.g:
// it will return <[[typename>]] instead of <[[typename]]>
- else if (getDeclName().isEmpty())
+ if (getDeclName().isEmpty())
return SourceRange(getBeginLoc());
return TypeDecl::getSourceRange();
}
+void TemplateTypeParmDecl::setDefaultArgument(
+ const ASTContext &C, const TemplateArgumentLoc &DefArg) {
+ if (DefArg.getArgument().isNull())
+ DefaultArgument.set(nullptr);
+ else
+ DefaultArgument.set(new (C) TemplateArgumentLoc(DefArg));
+}
+
unsigned TemplateTypeParmDecl::getDepth() const {
return getTypeForDecl()->castAs<TemplateTypeParmType>()->getDepth();
}
@@ -788,14 +795,21 @@ NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID,
SourceRange NonTypeTemplateParmDecl::getSourceRange() const {
if (hasDefaultArgument() && !defaultArgumentWasInherited())
return SourceRange(getOuterLocStart(),
- getDefaultArgument()->getSourceRange().getEnd());
+ getDefaultArgument().getSourceRange().getEnd());
return DeclaratorDecl::getSourceRange();
}
SourceLocation NonTypeTemplateParmDecl::getDefaultArgumentLoc() const {
- return hasDefaultArgument()
- ? getDefaultArgument()->getSourceRange().getBegin()
- : SourceLocation();
+ return hasDefaultArgument() ? getDefaultArgument().getSourceRange().getBegin()
+ : SourceLocation();
+}
+
+void NonTypeTemplateParmDecl::setDefaultArgument(
+ const ASTContext &C, const TemplateArgumentLoc &DefArg) {
+ if (DefArg.getArgument().isNull())
+ DefaultArgument.set(nullptr);
+ else
+ DefaultArgument.set(new (C) TemplateArgumentLoc(DefArg));
}
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 859a3fabea32..6607727b5246 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -1050,34 +1050,85 @@ bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
if (T->isRecordType()) {
const Record *R = getRecord(E->getType());
- if (Inits.size() == 1 && E->getType() == Inits[0]->getType()) {
+ if (Inits.size() == 1 && E->getType() == Inits[0]->getType())
return this->visitInitializer(Inits[0]);
+
+ auto initPrimitiveField = [=](const Record::Field *FieldToInit,
+ const Expr *Init, PrimType T) -> bool {
+ if (!this->visit(Init))
+ return false;
+
+ if (FieldToInit->isBitField()) {
+ if (!this->emitInitBitField(T, FieldToInit, E))
+ return false;
+ } else {
+ if (!this->emitInitField(T, FieldToInit->Offset, E))
+ return false;
+ }
+ return this->emitPopPtr(E);
+ };
+
+ auto initCompositeField = [=](const Record::Field *FieldToInit,
+ const Expr *Init) -> bool {
+ // Non-primitive case. Get a pointer to the field-to-initialize
+ // on the stack and recurse into visitInitializer().
+ if (!this->emitGetPtrField(FieldToInit->Offset, Init))
+ return false;
+ if (!this->visitInitializer(Init))
+ return false;
+ return this->emitPopPtr(E);
+ };
+
+ if (R->isUnion()) {
+ if (Inits.size() == 0) {
+ // Zero-initialize the first union field.
+ if (R->getNumFields() == 0)
+ return this->emitFinishInit(E);
+ const Record::Field *FieldToInit = R->getField(0u);
+ QualType FieldType = FieldToInit->Desc->getType();
+ if (std::optional<PrimType> T = classify(FieldType)) {
+ if (!this->visitZeroInitializer(*T, FieldType, E))
+ return false;
+ if (!this->emitInitField(*T, FieldToInit->Offset, E))
+ return false;
+ }
+ // FIXME: Non-primitive case?
+ } else {
+ const Expr *Init = Inits[0];
+ const FieldDecl *FToInit = nullptr;
+ if (const auto *ILE = dyn_cast<InitListExpr>(E))
+ FToInit = ILE->getInitializedFieldInUnion();
+ else
+ FToInit = cast<CXXParenListInitExpr>(E)->getInitializedFieldInUnion();
+
+ if (!this->emitDupPtr(E))
+ return false;
+
+ const Record::Field *FieldToInit = R->getField(FToInit);
+ if (std::optional<PrimType> T = classify(Init)) {
+ if (!initPrimitiveField(FieldToInit, Init, *T))
+ return false;
+ } else {
+ if (!initCompositeField(FieldToInit, Init))
+ return false;
+ }
+ }
+ return this->emitFinishInit(E);
}
+ assert(!R->isUnion());
unsigned InitIndex = 0;
for (const Expr *Init : Inits) {
// Skip unnamed bitfields.
while (InitIndex < R->getNumFields() &&
R->getField(InitIndex)->Decl->isUnnamedBitField())
++InitIndex;
-
if (!this->emitDupPtr(E))
return false;
if (std::optional<PrimType> T = classify(Init)) {
const Record::Field *FieldToInit = R->getField(InitIndex);
- if (!this->visit(Init))
- return false;
-
- if (FieldToInit->isBitField()) {
- if (!this->emitInitBitField(*T, FieldToInit, E))
- return false;
- } else {
- if (!this->emitInitField(*T, FieldToInit->Offset, E))
- return false;
- }
-
- if (!this->emitPopPtr(E))
+ if (!initPrimitiveField(FieldToInit, Init, *T))
return false;
++InitIndex;
} else {
@@ -1095,21 +1146,13 @@ bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
// into the Record's fields.
} else {
const Record::Field *FieldToInit = R->getField(InitIndex);
- // Non-primitive case. Get a pointer to the field-to-initialize
- // on the stack and recurse into visitInitializer().
- if (!this->emitGetPtrField(FieldToInit->Offset, Init))
- return false;
-
- if (!this->visitInitializer(Init))
- return false;
-
- if (!this->emitPopPtr(E))
+ if (!initCompositeField(FieldToInit, Init))
return false;
++InitIndex;
}
}
}
- return true;
+ return this->emitFinishInit(E);
}
if (T->isArrayType()) {
@@ -1133,7 +1176,7 @@ bool ByteCodeExprGen<Emitter>::visitInitList(ArrayRef<const Expr *> Inits,
}
}
- return true;
+ return this->emitFinishInit(E);
}
if (const auto *ComplexTy = E->getType()->getAs<ComplexType>()) {
@@ -3752,7 +3795,8 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
}
} else {
if (const auto *VD = dyn_cast<VarDecl>(D);
- VD && VD->getAnyInitializer() && VD->getType().isConstQualified()) {
+ VD && VD->getAnyInitializer() && VD->getType().isConstQualified() &&
+ !VD->isWeak()) {
if (!this->visitVarDecl(VD))
return false;
// Retry.
@@ -3763,6 +3807,8 @@ bool ByteCodeExprGen<Emitter>::VisitDeclRefExpr(const DeclRefExpr *E) {
if (std::optional<unsigned> I = P.getOrCreateDummy(D)) {
if (!this->emitGetPtrGlobal(*I, E))
return false;
+ if (E->getType()->isVoidType())
+ return true;
// Convert the dummy pointer to another pointer type if we have to.
if (PrimType PT = classifyPrim(E); PT != PT_Ptr) {
if (!this->emitDecayPtr(PT_Ptr, PT, E))
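A minimal sketch, assuming the experimental constant interpreter is enabled (-fexperimental-new-constant-interpreter), of the union initializations the new code path above is meant to cover:

union U {
  int a;
  float f;
};
constexpr U u1{};           // zero-initializes and activates the first field
constexpr U u2{.f = 1.0f};  // initializes and activates the designated member
static_assert(u2.f == 1.0f);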
diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp
index d0466902247b..746b765ca421 100644
--- a/clang/lib/AST/Interp/Descriptor.cpp
+++ b/clang/lib/AST/Interp/Descriptor.cpp
@@ -137,9 +137,8 @@ static void moveArrayDesc(Block *B, const std::byte *Src, std::byte *Dst,
}
static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
- bool IsActive, const Descriptor *D,
+ bool IsActive, bool IsUnion, const Descriptor *D,
unsigned FieldOffset) {
- bool IsUnion = false; // FIXME
auto *Desc = reinterpret_cast<InlineDescriptor *>(Ptr + FieldOffset) - 1;
Desc->Offset = FieldOffset;
Desc->Desc = D;
@@ -174,7 +173,7 @@ static void initBase(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
initBase(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, V.Desc,
V.Offset, false);
for (const auto &F : D->ElemRecord->fields())
- initField(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, F.Desc,
+ initField(B, Ptr + FieldOffset, IsConst, IsMutable, IsActive, IsUnion, F.Desc,
F.Offset);
// If this is initializing a virtual base, we do NOT want to consider its
@@ -193,7 +192,7 @@ static void ctorRecord(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable,
for (const auto &V : D->ElemRecord->bases())
initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, false);
for (const auto &F : D->ElemRecord->fields())
- initField(B, Ptr, IsConst, IsMutable, IsActive, F.Desc, F.Offset);
+ initField(B, Ptr, IsConst, IsMutable, IsActive, D->ElemRecord->isUnion(), F.Desc, F.Offset);
for (const auto &V : D->ElemRecord->virtual_bases())
initBase(B, Ptr, IsConst, IsMutable, IsActive, V.Desc, V.Offset, true);
}
diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp
index e92d686c724c..150a793da881 100644
--- a/clang/lib/AST/Interp/EvaluationResult.cpp
+++ b/clang/lib/AST/Interp/EvaluationResult.cpp
@@ -101,6 +101,10 @@ static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc,
Pointer FieldPtr = BasePtr.atField(F.Offset);
QualType FieldType = F.Decl->getType();
+ // Don't check inactive union members.
+ if (R->isUnion() && !FieldPtr.isActive())
+ continue;
+
if (FieldType->isRecordType()) {
Result &= CheckFieldsInitialized(S, Loc, FieldPtr, FieldPtr.getRecord());
} else if (FieldType->isIncompleteArrayType()) {
diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index 3e4da487e43c..145fa65791da 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -18,6 +18,7 @@
#include "clang/AST/ASTContext.h"
#include "clang/AST/ASTDiagnostic.h"
#include "clang/AST/CXXInheritance.h"
+#include "clang/AST/DeclObjC.h"
#include "clang/AST/Expr.h"
#include "clang/AST/ExprCXX.h"
#include "llvm/ADT/APSInt.h"
@@ -76,18 +77,15 @@ static bool diagnoseUnknownDecl(InterpState &S, CodePtr OpPC,
} else {
S.FFDiag(E);
}
- } else if (const auto *VD = dyn_cast<VarDecl>(D)) {
- if (!VD->getType().isConstQualified()) {
- diagnoseNonConstVariable(S, OpPC, VD);
- return false;
- }
-
- // const, but no initializer.
- if (!VD->getAnyInitializer()) {
- diagnoseMissingInitializer(S, OpPC, VD);
- return false;
- }
+ return false;
}
+
+ if (!D->getType().isConstQualified())
+ diagnoseNonConstVariable(S, OpPC, D);
+ else if (const auto *VD = dyn_cast<VarDecl>(D);
+ VD && !VD->getAnyInitializer())
+ diagnoseMissingInitializer(S, OpPC, VD);
+
return false;
}
@@ -104,6 +102,11 @@ static void diagnoseNonConstVariable(InterpState &S, CodePtr OpPC,
return;
}
+ // Rather random, but this is to match the diagnostic output of the current
+ // interpreter.
+ if (isa<ObjCIvarDecl>(VD))
+ return;
+
if (VD->getType()->isIntegralOrEnumerationType()) {
S.FFDiag(Loc, diag::note_constexpr_ltor_non_const_int, 1) << VD;
S.Note(VD->getLocation(), diag::note_declared_at);
@@ -454,16 +457,16 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
if (!CheckConstant(S, OpPC, Ptr))
return false;
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Read))
return false;
if (!CheckExtern(S, OpPC, Ptr))
return false;
if (!CheckRange(S, OpPC, Ptr, AK_Read))
return false;
- if (!CheckInitialized(S, OpPC, Ptr, AK_Read))
- return false;
if (!CheckActive(S, OpPC, Ptr, AK_Read))
return false;
+ if (!CheckInitialized(S, OpPC, Ptr, AK_Read))
+ return false;
if (!CheckTemporary(S, OpPC, Ptr, AK_Read))
return false;
if (!CheckMutable(S, OpPC, Ptr))
@@ -474,7 +477,7 @@ bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
if (!CheckLive(S, OpPC, Ptr, AK_Assign))
return false;
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Assign))
return false;
if (!CheckExtern(S, OpPC, Ptr))
return false;
@@ -657,7 +660,8 @@ bool CheckDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR) {
return diagnoseUnknownDecl(S, OpPC, D);
}
-bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
+bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+ AccessKinds AK) {
if (!Ptr.isDummy())
return true;
@@ -666,7 +670,15 @@ bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
if (!D)
return false;
- return diagnoseUnknownDecl(S, OpPC, D);
+ if (AK == AK_Read || AK == AK_Increment || AK == AK_Decrement)
+ return diagnoseUnknownDecl(S, OpPC, D);
+
+ assert(AK == AK_Assign);
+ if (S.getLangOpts().CPlusPlus11) {
+ const SourceInfo &E = S.Current->getSource(OpPC);
+ S.FFDiag(E, diag::note_constexpr_modify_global);
+ }
+ return false;
}
bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F,
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index bc2ca126ce36..eca1792e6471 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -56,7 +56,8 @@ bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
AccessKinds AK);
/// Checks if a pointer is a dummy pointer.
-bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr);
+bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
+ AccessKinds AK);
/// Checks if a pointer is null.
bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
@@ -588,7 +589,7 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
template <PrimType Name, class T = typename PrimConv<Name>::T>
bool Inc(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Increment))
return false;
if (!CheckInitialized(S, OpPC, Ptr, AK_Increment))
return false;
@@ -602,7 +603,7 @@ bool Inc(InterpState &S, CodePtr OpPC) {
template <PrimType Name, class T = typename PrimConv<Name>::T>
bool IncPop(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Increment))
return false;
if (!CheckInitialized(S, OpPC, Ptr, AK_Increment))
return false;
@@ -617,7 +618,7 @@ bool IncPop(InterpState &S, CodePtr OpPC) {
template <PrimType Name, class T = typename PrimConv<Name>::T>
bool Dec(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Decrement))
return false;
if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement))
return false;
@@ -631,7 +632,7 @@ bool Dec(InterpState &S, CodePtr OpPC) {
template <PrimType Name, class T = typename PrimConv<Name>::T>
bool DecPop(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (!CheckDummy(S, OpPC, Ptr))
+ if (!CheckDummy(S, OpPC, Ptr, AK_Decrement))
return false;
if (!CheckInitialized(S, OpPC, Ptr, AK_Decrement))
return false;
@@ -1335,16 +1336,19 @@ inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) {
inline bool FinishInitPop(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.pop<Pointer>();
- if (Ptr.canBeInitialized())
+ if (Ptr.canBeInitialized()) {
Ptr.initialize();
+ Ptr.activate();
+ }
return true;
}
inline bool FinishInit(InterpState &S, CodePtr OpPC) {
const Pointer &Ptr = S.Stk.peek<Pointer>();
-
- if (Ptr.canBeInitialized())
+ if (Ptr.canBeInitialized()) {
Ptr.initialize();
+ Ptr.activate();
+ }
return true;
}
@@ -1370,9 +1374,6 @@ inline bool GetPtrVirtBasePop(InterpState &S, CodePtr OpPC,
const Pointer &Ptr = S.Stk.pop<Pointer>();
if (!CheckNull(S, OpPC, Ptr, CSK_Base))
return false;
- if (Ptr.isDummy()) // FIXME: Once we have type info for dummy pointers, this
- // needs to go.
- return false;
return VirtBaseHelper(S, OpPC, D, Ptr);
}
@@ -1538,9 +1539,6 @@ inline bool Memcpy(InterpState &S, CodePtr OpPC) {
template <class T, ArithOp Op>
bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset,
const Pointer &Ptr) {
- if (!CheckRange(S, OpPC, Ptr, CSK_ArrayToPointer))
- return false;
-
// A zero offset does not change the pointer.
if (Offset.isZero()) {
S.Stk.push<Pointer>(Ptr);
@@ -1558,8 +1556,12 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset,
if (!CheckArray(S, OpPC, Ptr))
return false;
- uint64_t Index = Ptr.getIndex();
uint64_t MaxIndex = static_cast<uint64_t>(Ptr.getNumElems());
+ uint64_t Index;
+ if (Ptr.isOnePastEnd())
+ Index = MaxIndex;
+ else
+ Index = Ptr.getIndex();
bool Invalid = false;
// Helper to report an invalid offset, computed as APSInt.
diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp
index 565c85bc2e0c..00206d09c113 100644
--- a/clang/lib/AST/Interp/InterpBuiltin.cpp
+++ b/clang/lib/AST/Interp/InterpBuiltin.cpp
@@ -214,7 +214,7 @@ static bool interp__builtin_strlen(InterpState &S, CodePtr OpPC,
if (!CheckLive(S, OpPC, StrPtr, AK_Read))
return false;
- if (!CheckDummy(S, OpPC, StrPtr))
+ if (!CheckDummy(S, OpPC, StrPtr, AK_Read))
return false;
assert(StrPtr.getFieldDesc()->isPrimitiveArray());
diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp
index ee8cedccb8d4..252f7ea46086 100644
--- a/clang/lib/AST/Interp/Pointer.cpp
+++ b/clang/lib/AST/Interp/Pointer.cpp
@@ -144,13 +144,18 @@ APValue Pointer::toAPValue() const {
// TODO: compute the offset into the object.
CharUnits Offset = CharUnits::Zero();
- bool IsOnePastEnd = isOnePastEnd();
// Build the path into the object.
Pointer Ptr = *this;
while (Ptr.isField() || Ptr.isArrayElement()) {
- if (Ptr.isArrayElement()) {
- Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getIndex()));
+ if (Ptr.isArrayRoot()) {
+ Path.push_back(APValue::LValuePathEntry::ArrayIndex(0));
+ Ptr = Ptr.getBase();
+ } else if (Ptr.isArrayElement()) {
+ if (Ptr.isOnePastEnd())
+ Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getArray().getNumElems()));
+ else
+ Path.push_back(APValue::LValuePathEntry::ArrayIndex(Ptr.getIndex()));
Ptr = Ptr.getArray();
} else {
// TODO: figure out if base is virtual
@@ -173,7 +178,7 @@ APValue Pointer::toAPValue() const {
// Just invert the order of the elements.
std::reverse(Path.begin(), Path.end());
- return APValue(Base, Offset, Path, IsOnePastEnd, /*IsNullPtr=*/false);
+ return APValue(Base, Offset, Path, /*IsOnePastEnd=*/false, /*IsNullPtr=*/false);
}
void Pointer::print(llvm::raw_ostream &OS) const {
@@ -346,6 +351,7 @@ std::optional<APValue> Pointer::toRValue(const Context &Ctx) const {
} else {
Ok &= Composite(FieldTy, FP, Value);
}
+ ActiveField = FP.getFieldDesc()->asFieldDecl();
break;
}
}
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index 3ade5756e580..93ca754d04a6 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -314,12 +314,14 @@ public:
/// Returns the type of the innermost field.
QualType getType() const {
if (inPrimitiveArray() && Offset != asBlockPointer().Base) {
- // Unfortunately, complex types are not array types in clang, but they are
- // for us.
+ // Unfortunately, complex and vector types are not array types in clang,
+ // but they are for us.
if (const auto *AT = getFieldDesc()->getType()->getAsArrayTypeUnsafe())
return AT->getElementType();
if (const auto *CT = getFieldDesc()->getType()->getAs<ComplexType>())
return CT->getElementType();
+ if (const auto *CT = getFieldDesc()->getType()->getAs<VectorType>())
+ return CT->getElementType();
}
return getFieldDesc()->getType();
}
@@ -535,9 +537,6 @@ public:
if (isZero())
return 0;
- if (isElementPastEnd())
- return 1;
-
// narrow()ed element in a composite array.
if (asBlockPointer().Base > sizeof(InlineDescriptor) &&
asBlockPointer().Base == Offset)
diff --git a/clang/lib/AST/Interp/Record.cpp b/clang/lib/AST/Interp/Record.cpp
index 6a0a28bc9124..8ded765fc1c4 100644
--- a/clang/lib/AST/Interp/Record.cpp
+++ b/clang/lib/AST/Interp/Record.cpp
@@ -16,7 +16,7 @@ Record::Record(const RecordDecl *Decl, BaseList &&SrcBases,
FieldList &&SrcFields, VirtualBaseList &&SrcVirtualBases,
unsigned VirtualSize, unsigned BaseSize)
: Decl(Decl), Bases(std::move(SrcBases)), Fields(std::move(SrcFields)),
- BaseSize(BaseSize), VirtualSize(VirtualSize) {
+ BaseSize(BaseSize), VirtualSize(VirtualSize), IsUnion(Decl->isUnion()) {
for (Base &V : SrcVirtualBases)
VirtualBases.push_back({ V.Decl, V.Offset + BaseSize, V.Desc, V.R });
diff --git a/clang/lib/AST/Interp/Record.h b/clang/lib/AST/Interp/Record.h
index cf0480b3f62f..83e15b125f77 100644
--- a/clang/lib/AST/Interp/Record.h
+++ b/clang/lib/AST/Interp/Record.h
@@ -53,7 +53,7 @@ public:
/// Returns the name of the underlying declaration.
const std::string getName() const;
/// Checks if the record is a union.
- bool isUnion() const { return getDecl()->isUnion(); }
+ bool isUnion() const { return IsUnion; }
/// Returns the size of the record.
unsigned getSize() const { return BaseSize; }
/// Returns the full size of the record, including records.
@@ -132,6 +132,8 @@ private:
unsigned BaseSize;
/// Size of all virtual bases.
unsigned VirtualSize;
+ /// If this record is a union.
+ bool IsUnion;
};
} // namespace interp
diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp
index 42608476b1c1..3bbb3a905e9b 100644
--- a/clang/lib/AST/JSONNodeDumper.cpp
+++ b/clang/lib/AST/JSONNodeDumper.cpp
@@ -1028,7 +1028,7 @@ void JSONNodeDumper::VisitTemplateTypeParmDecl(const TemplateTypeParmDecl *D) {
if (D->hasDefaultArgument())
JOS.attributeObject("defaultArg", [=] {
- Visit(D->getDefaultArgument(), SourceRange(),
+ Visit(D->getDefaultArgument().getArgument(), SourceRange(),
D->getDefaultArgStorage().getInheritedFrom(),
D->defaultArgumentWasInherited() ? "inherited from" : "previous");
});
@@ -1044,7 +1044,7 @@ void JSONNodeDumper::VisitNonTypeTemplateParmDecl(
if (D->hasDefaultArgument())
JOS.attributeObject("defaultArg", [=] {
- Visit(D->getDefaultArgument(), SourceRange(),
+ Visit(D->getDefaultArgument().getArgument(), SourceRange(),
D->getDefaultArgStorage().getInheritedFrom(),
D->defaultArgumentWasInherited() ? "inherited from" : "previous");
});
diff --git a/clang/lib/AST/ODRDiagsEmitter.cpp b/clang/lib/AST/ODRDiagsEmitter.cpp
index 5b1cdc16e2ea..37f0f68c9235 100644
--- a/clang/lib/AST/ODRDiagsEmitter.cpp
+++ b/clang/lib/AST/ODRDiagsEmitter.cpp
@@ -1409,13 +1409,15 @@ bool ODRDiagsEmitter::diagnoseMismatch(
}
if (HasFirstDefaultArgument && HasSecondDefaultArgument) {
- QualType FirstType = FirstTTPD->getDefaultArgument();
- QualType SecondType = SecondTTPD->getDefaultArgument();
- if (computeODRHash(FirstType) != computeODRHash(SecondType)) {
+ TemplateArgument FirstTA =
+ FirstTTPD->getDefaultArgument().getArgument();
+ TemplateArgument SecondTA =
+ SecondTTPD->getDefaultArgument().getArgument();
+ if (computeODRHash(FirstTA) != computeODRHash(SecondTA)) {
DiagTemplateError(FunctionTemplateParameterDifferentDefaultArgument)
- << (i + 1) << FirstType;
+ << (i + 1) << FirstTA;
DiagTemplateNote(FunctionTemplateParameterDifferentDefaultArgument)
- << (i + 1) << SecondType;
+ << (i + 1) << SecondTA;
return true;
}
}
@@ -1521,8 +1523,11 @@ bool ODRDiagsEmitter::diagnoseMismatch(
}
if (HasFirstDefaultArgument && HasSecondDefaultArgument) {
- Expr *FirstDefaultArgument = FirstNTTPD->getDefaultArgument();
- Expr *SecondDefaultArgument = SecondNTTPD->getDefaultArgument();
+ TemplateArgument FirstDefaultArgument =
+ FirstNTTPD->getDefaultArgument().getArgument();
+ TemplateArgument SecondDefaultArgument =
+ SecondNTTPD->getDefaultArgument().getArgument();
+
if (computeODRHash(FirstDefaultArgument) !=
computeODRHash(SecondDefaultArgument)) {
DiagTemplateError(FunctionTemplateParameterDifferentDefaultArgument)
diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp
index 6f04739cf669..246e56231539 100644
--- a/clang/lib/AST/ODRHash.cpp
+++ b/clang/lib/AST/ODRHash.cpp
@@ -462,7 +462,7 @@ public:
D->hasDefaultArgument() && !D->defaultArgumentWasInherited();
Hash.AddBoolean(hasDefaultArgument);
if (hasDefaultArgument) {
- AddTemplateArgument(D->getDefaultArgument());
+ AddTemplateArgument(D->getDefaultArgument().getArgument());
}
Hash.AddBoolean(D->isParameterPack());
@@ -480,7 +480,7 @@ public:
D->hasDefaultArgument() && !D->defaultArgumentWasInherited();
Hash.AddBoolean(hasDefaultArgument);
if (hasDefaultArgument) {
- AddStmt(D->getDefaultArgument());
+ AddTemplateArgument(D->getDefaultArgument().getArgument());
}
Hash.AddBoolean(D->isParameterPack());
diff --git a/clang/lib/AST/ParentMap.cpp b/clang/lib/AST/ParentMap.cpp
index 3d6a1cc84c7b..534793b837bb 100644
--- a/clang/lib/AST/ParentMap.cpp
+++ b/clang/lib/AST/ParentMap.cpp
@@ -97,6 +97,22 @@ static void BuildParentMap(MapTy& M, Stmt* S,
BuildParentMap(M, SubStmt, OVMode);
}
break;
+ case Stmt::CXXDefaultArgExprClass:
+ if (auto *Arg = dyn_cast<CXXDefaultArgExpr>(S)) {
+ if (Arg->hasRewrittenInit()) {
+ M[Arg->getExpr()] = S;
+ BuildParentMap(M, Arg->getExpr(), OVMode);
+ }
+ }
+ break;
+ case Stmt::CXXDefaultInitExprClass:
+ if (auto *Init = dyn_cast<CXXDefaultInitExpr>(S)) {
+ if (Init->hasRewrittenInit()) {
+ M[Init->getExpr()] = S;
+ BuildParentMap(M, Init->getExpr(), OVMode);
+ }
+ }
+ break;
default:
for (Stmt *SubStmt : S->children()) {
if (SubStmt) {
diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp
index 3310d7dc24c5..a7ee973b7f7d 100644
--- a/clang/lib/AST/TemplateBase.cpp
+++ b/clang/lib/AST/TemplateBase.cpp
@@ -538,9 +538,19 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out,
Out << "nullptr";
break;
- case Template:
- getAsTemplate().print(Out, Policy, TemplateName::Qualified::Fully);
+ case Template: {
+ TemplateName TN = getAsTemplate();
+ if (const auto *TD = TN.getAsTemplateDecl();
+ TD && TD->getDeclName().isEmpty()) {
+ assert(isa<TemplateTemplateParmDecl>(TD) &&
+ "Unexpected anonymous template");
+ const auto *TTP = cast<TemplateTemplateParmDecl>(TD);
+ Out << "template-parameter-" << TTP->getDepth() << "-" << TTP->getIndex();
+ } else {
+ TN.print(Out, Policy, TemplateName::Qualified::Fully);
+ }
break;
+ }
case TemplateExpansion:
getAsTemplateOrTemplatePattern().print(Out, Policy);
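A small illustrative sketch of the case the new branch handles: an unnamed template template parameter, which previously printed with an empty name and now prints as "template-parameter-<depth>-<index>":

// Hypothetical example; the parameter below has no identifier, so when it is
// printed as a template argument the fallback spelling is used.
template <template <typename> class>  // unnamed template template parameter
struct Holder;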
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 3b90b8229dd1..04f105c12887 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -632,6 +632,16 @@ bool Type::isStructureType() const {
return false;
}
+bool Type::isStructureTypeWithFlexibleArrayMember() const {
+ const auto *RT = getAs<RecordType>();
+ if (!RT)
+ return false;
+ const auto *Decl = RT->getDecl();
+ if (!Decl->isStruct())
+ return false;
+ return Decl->hasFlexibleArrayMember();
+}
+
bool Type::isObjCBoxableRecordType() const {
if (const auto *RT = getAs<RecordType>())
return RT->getDecl()->hasAttr<ObjCBoxableAttr>();
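For illustration, a struct that the new isStructureTypeWithFlexibleArrayMember() predicate would report as true (sketch only; its callers live elsewhere in the tree):

struct Packet {
  unsigned len;
  char data[];  // flexible array member; Decl->hasFlexibleArrayMember() is true
};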
diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp
index 87f0a8728d85..5ed56b367a46 100644
--- a/clang/lib/AST/TypePrinter.cpp
+++ b/clang/lib/AST/TypePrinter.cpp
@@ -2273,16 +2273,17 @@ bool clang::isSubstitutedDefaultArgument(ASTContext &Ctx, TemplateArgument Arg,
if (auto *TTPD = dyn_cast<TemplateTypeParmDecl>(Param)) {
return TTPD->hasDefaultArgument() &&
- isSubstitutedTemplateArgument(Ctx, Arg, TTPD->getDefaultArgument(),
- Args, Depth);
+ isSubstitutedTemplateArgument(
+ Ctx, Arg, TTPD->getDefaultArgument().getArgument(), Args, Depth);
} else if (auto *TTPD = dyn_cast<TemplateTemplateParmDecl>(Param)) {
return TTPD->hasDefaultArgument() &&
isSubstitutedTemplateArgument(
Ctx, Arg, TTPD->getDefaultArgument().getArgument(), Args, Depth);
} else if (auto *NTTPD = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
return NTTPD->hasDefaultArgument() &&
- isSubstitutedTemplateArgument(Ctx, Arg, NTTPD->getDefaultArgument(),
- Args, Depth);
+ isSubstitutedTemplateArgument(
+ Ctx, Arg, NTTPD->getDefaultArgument().getArgument(), Args,
+ Depth);
}
return false;
}
diff --git a/clang/lib/Analysis/CFG.cpp b/clang/lib/Analysis/CFG.cpp
index 64e6155de090..02317257c274 100644
--- a/clang/lib/Analysis/CFG.cpp
+++ b/clang/lib/Analysis/CFG.cpp
@@ -556,6 +556,10 @@ public:
private:
// Visitors to walk an AST and construct the CFG.
+ CFGBlock *VisitCXXDefaultArgExpr(CXXDefaultArgExpr *Default,
+ AddStmtChoice asc);
+ CFGBlock *VisitCXXDefaultInitExpr(CXXDefaultInitExpr *Default,
+ AddStmtChoice asc);
CFGBlock *VisitInitListExpr(InitListExpr *ILE, AddStmtChoice asc);
CFGBlock *VisitAddrLabelExpr(AddrLabelExpr *A, AddStmtChoice asc);
CFGBlock *VisitAttributedStmt(AttributedStmt *A, AddStmtChoice asc);
@@ -2254,16 +2258,10 @@ CFGBlock *CFGBuilder::Visit(Stmt * S, AddStmtChoice asc,
asc, ExternallyDestructed);
case Stmt::CXXDefaultArgExprClass:
+ return VisitCXXDefaultArgExpr(cast<CXXDefaultArgExpr>(S), asc);
+
case Stmt::CXXDefaultInitExprClass:
- // FIXME: The expression inside a CXXDefaultArgExpr is owned by the
- // called function's declaration, not by the caller. If we simply add
- // this expression to the CFG, we could end up with the same Expr
- // appearing multiple times (PR13385).
- //
- // It's likewise possible for multiple CXXDefaultInitExprs for the same
- // expression to be used in the same function (through aggregate
- // initialization).
- return VisitStmt(S, asc);
+ return VisitCXXDefaultInitExpr(cast<CXXDefaultInitExpr>(S), asc);
case Stmt::CXXBindTemporaryExprClass:
return VisitCXXBindTemporaryExpr(cast<CXXBindTemporaryExpr>(S), asc);
@@ -2433,6 +2431,40 @@ CFGBlock *CFGBuilder::VisitChildren(Stmt *S) {
return B;
}
+CFGBlock *CFGBuilder::VisitCXXDefaultArgExpr(CXXDefaultArgExpr *Arg,
+ AddStmtChoice asc) {
+ if (Arg->hasRewrittenInit()) {
+ if (asc.alwaysAdd(*this, Arg)) {
+ autoCreateBlock();
+ appendStmt(Block, Arg);
+ }
+ return VisitStmt(Arg->getExpr(), asc);
+ }
+
+ // We can't add the default argument if it's not rewritten because the
+ // expression inside a CXXDefaultArgExpr is owned by the called function's
+ // declaration, not by the caller; if we did, we could end up with the same
+ // expression appearing multiple times.
+ return VisitStmt(Arg, asc);
+}
+
+CFGBlock *CFGBuilder::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *Init,
+ AddStmtChoice asc) {
+ if (Init->hasRewrittenInit()) {
+ if (asc.alwaysAdd(*this, Init)) {
+ autoCreateBlock();
+ appendStmt(Block, Init);
+ }
+ return VisitStmt(Init->getExpr(), asc);
+ }
+
+ // We can't add the default initializer if it's not rewritten because multiple
+ // CXXDefaultInitExprs for the same sub-expression may be used in the same
+ // function (through aggregate initialization); if we did, we could end up
+ // with the same expression appearing multiple times.
+ return VisitStmt(Init, asc);
+}
+
CFGBlock *CFGBuilder::VisitInitListExpr(InitListExpr *ILE, AddStmtChoice asc) {
if (asc.alwaysAdd(*this, ILE)) {
autoCreateBlock();
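A hedged sketch of the kind of call the new visitors affect: a default argument whose initializer is rewritten at the call site (for example, one that captures the caller's source location), so its sub-expression is now added to the caller's CFG. Whether this exact shape sets hasRewrittenInit() is an assumption:

#include <source_location>

void log(std::source_location loc = std::source_location::current());

void caller() {
  log();  // the rewritten default-argument expression is walked here
}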
diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp
index 143c04309d07..1dc51deb8298 100644
--- a/clang/lib/Basic/FileManager.cpp
+++ b/clang/lib/Basic/FileManager.cpp
@@ -82,6 +82,22 @@ getDirectoryFromFile(FileManager &FileMgr, StringRef Filename,
return FileMgr.getDirectoryRef(DirName, CacheFailure);
}
+DirectoryEntry *&FileManager::getRealDirEntry(const llvm::vfs::Status &Status) {
+ assert(Status.isDirectory() && "The directory should exist!");
+ // See if we have already opened a directory with the
+ // same inode (this occurs on Unix-like systems when one dir is
+ // symlinked to another, for example) or the same path (on
+ // Windows).
+ DirectoryEntry *&UDE = UniqueRealDirs[Status.getUniqueID()];
+
+ if (!UDE) {
+ // We don't have this directory yet, add it. We use the string
+ // key from the SeenDirEntries map as the string.
+ UDE = new (DirsAlloc.Allocate()) DirectoryEntry();
+ }
+ return UDE;
+}
+
/// Add all ancestors of the given path (pointing to either a file or
/// a directory) as virtual directories.
void FileManager::addAncestorsAsVirtualDirs(StringRef Path) {
@@ -99,10 +115,21 @@ void FileManager::addAncestorsAsVirtualDirs(StringRef Path) {
if (NamedDirEnt.second)
return;
- // Add the virtual directory to the cache.
- auto *UDE = new (DirsAlloc.Allocate()) DirectoryEntry();
- NamedDirEnt.second = *UDE;
- VirtualDirectoryEntries.push_back(UDE);
+ // Check to see if the directory exists.
+ llvm::vfs::Status Status;
+ auto statError =
+ getStatValue(DirName, Status, false, nullptr /*directory lookup*/);
+ if (statError) {
+ // There's no real directory at the given path.
+ // Add the virtual directory to the cache.
+ auto *UDE = new (DirsAlloc.Allocate()) DirectoryEntry();
+ NamedDirEnt.second = *UDE;
+ VirtualDirectoryEntries.push_back(UDE);
+ } else {
+ // A real directory exists at this path.
+ DirectoryEntry *&UDE = getRealDirEntry(Status);
+ NamedDirEnt.second = *UDE;
+ }
// Recursively add the other ancestors.
addAncestorsAsVirtualDirs(DirName);
@@ -162,17 +189,8 @@ FileManager::getDirectoryRef(StringRef DirName, bool CacheFailure) {
return llvm::errorCodeToError(statError);
}
- // It exists. See if we have already opened a directory with the
- // same inode (this occurs on Unix-like systems when one dir is
- // symlinked to another, for example) or the same path (on
- // Windows).
- DirectoryEntry *&UDE = UniqueRealDirs[Status.getUniqueID()];
-
- if (!UDE) {
- // We don't have this directory yet, add it. We use the string
- // key from the SeenDirEntries map as the string.
- UDE = new (DirsAlloc.Allocate()) DirectoryEntry();
- }
+ // It exists.
+ DirectoryEntry *&UDE = getRealDirEntry(Status);
NamedDirEnt.second = *UDE;
return DirectoryEntryRef(NamedDirEnt);
diff --git a/clang/lib/Basic/Targets/Mips.cpp b/clang/lib/Basic/Targets/Mips.cpp
index 3a65f53c5248..174bc9d2ab99 100644
--- a/clang/lib/Basic/Targets/Mips.cpp
+++ b/clang/lib/Basic/Targets/Mips.cpp
@@ -273,6 +273,34 @@ bool MipsTargetInfo::validateTarget(DiagnosticsEngine &Diags) const {
Diags.Report(diag::err_mips_fp64_req) << "-mfp64";
return false;
}
+ // FPXX requires mips2+
+ if (FPMode == FPXX && CPU == "mips1") {
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-mfpxx" << CPU;
+ return false;
+ }
+ // -mmsa together with -msoft-float does not make sense
+ if (FloatABI == SoftFloat && HasMSA) {
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-msoft-float"
+ << "-mmsa";
+ return false;
+ }
+ // Option -mmsa is permitted on MIPS32 only if revision 2 or higher is present
+ if (HasMSA && (CPU == "mips1" || CPU == "mips2" || getISARev() < 2) &&
+ ABI == "o32") {
+ Diags.Report(diag::err_mips_fp64_req) << "-mmsa";
+ return false;
+ }
+ // MSA requires FP64
+ if (FPMode == FPXX && HasMSA) {
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-mfpxx"
+ << "-mmsa";
+ return false;
+ }
+ if (FPMode == FP32 && HasMSA) {
+ Diags.Report(diag::err_opt_not_valid_with_opt) << "-mfp32"
+ << "-mmsa";
+ return false;
+ }
return true;
}
diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h
index 4db97867df60..e4a449d1ff30 100644
--- a/clang/lib/Basic/Targets/WebAssembly.h
+++ b/clang/lib/Basic/Targets/WebAssembly.h
@@ -90,6 +90,9 @@ public:
StringRef getABI() const override;
bool setABI(const std::string &Name) override;
+ bool useFP16ConversionIntrinsics() const override {
+ return !HasHalfPrecision;
+ }
protected:
void getTargetDefines(const LangOptions &Opts,
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index b823eaf6ce33..3a30cff917bb 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -310,15 +310,9 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAVX512VNNI = true;
} else if (Feature == "+avx512bf16") {
HasAVX512BF16 = true;
- } else if (Feature == "+avx512er") {
- HasAVX512ER = true;
- Diags.Report(diag::warn_knl_knm_isa_support_removed);
} else if (Feature == "+avx512fp16") {
HasAVX512FP16 = true;
HasLegalHalfType = true;
- } else if (Feature == "+avx512pf") {
- HasAVX512PF = true;
- Diags.Report(diag::warn_knl_knm_isa_support_removed);
} else if (Feature == "+avx512dq") {
HasAVX512DQ = true;
} else if (Feature == "+avx512bitalg") {
@@ -375,9 +369,6 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasWBNOINVD = true;
} else if (Feature == "+prefetchi") {
HasPREFETCHI = true;
- } else if (Feature == "+prefetchwt1") {
- HasPREFETCHWT1 = true;
- Diags.Report(diag::warn_knl_knm_isa_support_removed);
} else if (Feature == "+clzero") {
HasCLZERO = true;
} else if (Feature == "+cldemote") {
@@ -840,12 +831,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AVX512VNNI__");
if (HasAVX512BF16)
Builder.defineMacro("__AVX512BF16__");
- if (HasAVX512ER)
- Builder.defineMacro("__AVX512ER__");
if (HasAVX512FP16)
Builder.defineMacro("__AVX512FP16__");
- if (HasAVX512PF)
- Builder.defineMacro("__AVX512PF__");
if (HasAVX512DQ)
Builder.defineMacro("__AVX512DQ__");
if (HasAVX512BITALG)
@@ -897,8 +884,6 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__SM4__");
if (HasPREFETCHI)
Builder.defineMacro("__PREFETCHI__");
- if (HasPREFETCHWT1)
- Builder.defineMacro("__PREFETCHWT1__");
if (HasCLZERO)
Builder.defineMacro("__CLZERO__");
if (HasKL)
@@ -1084,9 +1069,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx512vpopcntdq", true)
.Case("avx512vnni", true)
.Case("avx512bf16", true)
- .Case("avx512er", true)
.Case("avx512fp16", true)
- .Case("avx512pf", true)
.Case("avx512dq", true)
.Case("avx512bitalg", true)
.Case("avx512bw", true)
@@ -1134,7 +1117,6 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("pku", true)
.Case("popcnt", true)
.Case("prefetchi", true)
- .Case("prefetchwt1", true)
.Case("prfchw", true)
.Case("ptwrite", true)
.Case("raoint", true)
@@ -1201,9 +1183,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("avx512vpopcntdq", HasAVX512VPOPCNTDQ)
.Case("avx512vnni", HasAVX512VNNI)
.Case("avx512bf16", HasAVX512BF16)
- .Case("avx512er", HasAVX512ER)
.Case("avx512fp16", HasAVX512FP16)
- .Case("avx512pf", HasAVX512PF)
.Case("avx512dq", HasAVX512DQ)
.Case("avx512bitalg", HasAVX512BITALG)
.Case("avx512bw", HasAVX512BW)
@@ -1253,7 +1233,6 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("pku", HasPKU)
.Case("popcnt", HasPOPCNT)
.Case("prefetchi", HasPREFETCHI)
- .Case("prefetchwt1", HasPREFETCHWT1)
.Case("prfchw", HasPRFCHW)
.Case("ptwrite", HasPTWRITE)
.Case("raoint", HasRAOINT)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 6a0a6cb84203..0633b7e0da96 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -103,8 +103,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasAVX512VNNI = false;
bool HasAVX512FP16 = false;
bool HasAVX512BF16 = false;
- bool HasAVX512ER = false;
- bool HasAVX512PF = false;
bool HasAVX512DQ = false;
bool HasAVX512BITALG = false;
bool HasAVX512BW = false;
@@ -136,7 +134,6 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasCLWB = false;
bool HasMOVBE = false;
bool HasPREFETCHI = false;
- bool HasPREFETCHWT1 = false;
bool HasRDPID = false;
bool HasRDPRU = false;
bool HasRetpolineExternalThunk = false;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ba94bf89e475..0549afa12e43 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -21230,6 +21230,17 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_storef16_f32);
return Builder.CreateCall(Callee, {Val, Addr});
}
+ case WebAssembly::BI__builtin_wasm_splat_f16x8: {
+ Value *Val = EmitScalarExpr(E->getArg(0));
+ Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_splat_f16x8);
+ return Builder.CreateCall(Callee, {Val});
+ }
+ case WebAssembly::BI__builtin_wasm_extract_lane_f16x8: {
+ Value *Vector = EmitScalarExpr(E->getArg(0));
+ Value *Index = EmitScalarExpr(E->getArg(1));
+ Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_extract_lane_f16x8);
+ return Builder.CreateCall(Callee, {Vector, Index});
+ }
case WebAssembly::BI__builtin_wasm_table_get: {
assert(E->getArg(0)->getType()->isArrayType());
Value *Table = EmitArrayToPointerDecay(E->getArg(0)).emitRawPointer(*this);
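A sketch of how the two new WebAssembly half-precision builtins might be used; the result and element types are assumptions, and the builtins are only usable on wasm targets with the half-precision feature enabled:

// Assumed usage; the splat result type is left for the compiler to deduce.
auto broadcast_and_read_lane0(float x) {
  auto v = __builtin_wasm_splat_f16x8(x);          // splat scalar into f16x8
  return __builtin_wasm_extract_lane_f16x8(v, 0);  // read lane 0 back as float
}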
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index cd1c48b42038..d6478cc6835d 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -317,8 +317,8 @@ pushTemporaryCleanup(CodeGenFunction &CGF, const MaterializeTemporaryExpr *M,
CleanupKind CleanupKind;
if (Lifetime == Qualifiers::OCL_Strong) {
const ValueDecl *VD = M->getExtendingDecl();
- bool Precise =
- VD && isa<VarDecl>(VD) && VD->hasAttr<ObjCPreciseLifetimeAttr>();
+ bool Precise = isa_and_nonnull<VarDecl>(VD) &&
+ VD->hasAttr<ObjCPreciseLifetimeAttr>();
CleanupKind = CGF.getARCCleanupKind();
Destroy = Precise ? &CodeGenFunction::destroyARCStrongPrecise
: &CodeGenFunction::destroyARCStrongImprecise;
@@ -4180,7 +4180,7 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E,
// If the base is a vector type, then we are forming a vector element lvalue
// with this subscript.
- if (E->getBase()->getType()->isVectorType() &&
+ if (E->getBase()->getType()->isSubscriptableVectorType() &&
!isa<ExtVectorElementExpr>(E->getBase())) {
// Emit the vector as an lvalue to get its address.
LValue LHS = EmitLValue(E->getBase());
@@ -4676,7 +4676,8 @@ LValue CodeGenFunction::EmitMemberExpr(const MemberExpr *E) {
LValue CodeGenFunction::EmitLValueForLambdaField(const FieldDecl *Field,
llvm::Value *ThisValue) {
bool HasExplicitObjectParameter = false;
- if (const auto *MD = dyn_cast_if_present<CXXMethodDecl>(CurCodeDecl)) {
+ const auto *MD = dyn_cast_if_present<CXXMethodDecl>(CurCodeDecl);
+ if (MD) {
HasExplicitObjectParameter = MD->isExplicitObjectMemberFunction();
assert(MD->getParent()->isLambda());
assert(MD->getParent() == Field->getParent());
@@ -4693,6 +4694,17 @@ LValue CodeGenFunction::EmitLValueForLambdaField(const FieldDecl *Field,
else
LambdaLV = MakeAddrLValue(AddrOfExplicitObject,
D->getType().getNonReferenceType());
+
+ // Make sure we have an lvalue to the lambda itself and not a derived class.
+ auto *ThisTy = D->getType().getNonReferenceType()->getAsCXXRecordDecl();
+ auto *LambdaTy = cast<CXXRecordDecl>(Field->getParent());
+ if (ThisTy != LambdaTy) {
+ const CXXCastPath &BasePathArray = getContext().LambdaCastPaths.at(MD);
+ Address Base = GetAddressOfBaseClass(
+ LambdaLV.getAddress(), ThisTy, BasePathArray.begin(),
+ BasePathArray.end(), /*NullCheckValue=*/false, SourceLocation());
+ LambdaLV = MakeAddrLValue(Base, QualType{LambdaTy->getTypeForDecl(), 0});
+ }
} else {
QualType LambdaTagType = getContext().getTagDeclType(Field->getParent());
LambdaLV = MakeNaturalAlignAddrLValue(ThisValue, LambdaTagType);
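A hedged sketch of the situation the derived-to-base adjustment above handles: an explicit-object ("deducing this") lambda whose closure type is used as a base class, so the captured field has to be reached through a base-class subobject. Whether this exact shape exercises the path is an assumption:

int use() {
  int x = 42;
  auto lam = [x](this auto &&self) { return x; };
  struct D : decltype(lam) {};
  return D{lam}();  // self deduces to D, which derives from the closure type
}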
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index eac5ef326293..6410f9e102c9 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -142,7 +142,7 @@ public:
/// of used expression from loop statement.
class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
void emitPreInitStmt(CodeGenFunction &CGF, const OMPLoopBasedDirective &S) {
- const DeclStmt *PreInits;
+ const Stmt *PreInits;
CodeGenFunction::OMPMapVars PreCondVars;
if (auto *LD = dyn_cast<OMPLoopDirective>(&S)) {
llvm::DenseSet<const VarDecl *> EmittedAsPrivate;
@@ -182,17 +182,34 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope {
}
return false;
});
- PreInits = cast_or_null<DeclStmt>(LD->getPreInits());
+ PreInits = LD->getPreInits();
} else if (const auto *Tile = dyn_cast<OMPTileDirective>(&S)) {
- PreInits = cast_or_null<DeclStmt>(Tile->getPreInits());
+ PreInits = Tile->getPreInits();
} else if (const auto *Unroll = dyn_cast<OMPUnrollDirective>(&S)) {
- PreInits = cast_or_null<DeclStmt>(Unroll->getPreInits());
+ PreInits = Unroll->getPreInits();
} else {
llvm_unreachable("Unknown loop-based directive kind.");
}
if (PreInits) {
- for (const auto *I : PreInits->decls())
- CGF.EmitVarDecl(cast<VarDecl>(*I));
+ // CompoundStmts and DeclStmts are used as lists of PreInit statements and
+ // declarations. Since declarations must be visible in the the following
+ // that they initialize, unpack the ComboundStmt they are nested in.
+ SmallVector<const Stmt *> PreInitStmts;
+ if (auto *PreInitCompound = dyn_cast<CompoundStmt>(PreInits))
+ llvm::append_range(PreInitStmts, PreInitCompound->body());
+ else
+ PreInitStmts.push_back(PreInits);
+
+ for (const Stmt *S : PreInitStmts) {
+ // EmitStmt skips any OMPCapturedExprDecls, but needs to be emitted
+ // here.
+ if (auto *PreInitDecl = dyn_cast<DeclStmt>(S)) {
+ for (Decl *I : PreInitDecl->decls())
+ CGF.EmitVarDecl(cast<VarDecl>(*I));
+ continue;
+ }
+ CGF.EmitStmt(S);
+ }
}
PreCondVars.restore(CGF);
}
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 227813ad44e8..e4774a587707 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4150,7 +4150,7 @@ llvm::GlobalValue::LinkageTypes getMultiversionLinkage(CodeGenModule &CGM,
}
static FunctionDecl *createDefaultTargetVersionFrom(const FunctionDecl *FD) {
- DeclContext *DeclCtx = FD->getASTContext().getTranslationUnitDecl();
+ auto *DeclCtx = const_cast<DeclContext *>(FD->getDeclContext());
TypeSourceInfo *TInfo = FD->getTypeSourceInfo();
StorageClass SC = FD->getStorageClass();
DeclarationName Name = FD->getNameInfo().getName();
@@ -5740,15 +5740,17 @@ CodeGenModule::getLLVMLinkageVarDefinition(const VarDecl *VD) {
static void replaceUsesOfNonProtoConstant(llvm::Constant *old,
llvm::Function *newFn) {
// Fast path.
- if (old->use_empty()) return;
+ if (old->use_empty())
+ return;
llvm::Type *newRetTy = newFn->getReturnType();
- SmallVector<llvm::Value*, 4> newArgs;
+ SmallVector<llvm::Value *, 4> newArgs;
+
+ SmallVector<llvm::CallBase *> callSitesToBeRemovedFromParent;
for (llvm::Value::use_iterator ui = old->use_begin(), ue = old->use_end();
- ui != ue; ) {
- llvm::Value::use_iterator use = ui++; // Increment before the use is erased.
- llvm::User *user = use->getUser();
+ ui != ue; ui++) {
+ llvm::User *user = ui->getUser();
// Recognize and replace uses of bitcasts. Most calls to
// unprototyped functions will use bitcasts.
@@ -5760,8 +5762,9 @@ static void replaceUsesOfNonProtoConstant(llvm::Constant *old,
// Recognize calls to the function.
llvm::CallBase *callSite = dyn_cast<llvm::CallBase>(user);
- if (!callSite) continue;
- if (!callSite->isCallee(&*use))
+ if (!callSite)
+ continue;
+ if (!callSite->isCallee(&*ui))
continue;
// If the return types don't match exactly, then we can't
@@ -5830,6 +5833,10 @@ static void replaceUsesOfNonProtoConstant(llvm::Constant *old,
if (callSite->getDebugLoc())
newCall->setDebugLoc(callSite->getDebugLoc());
+ callSitesToBeRemovedFromParent.push_back(callSite);
+ }
+
+ for (auto *callSite : callSitesToBeRemovedFromParent) {
callSite->eraseFromParent();
}
}
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index f4de21bac4b4..6ce2d32dd292 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -191,6 +191,10 @@ public:
bool isBranch() const { return FalseCount.has_value(); }
+ bool isMCDCBranch() const {
+ return std::holds_alternative<mcdc::BranchParameters>(MCDCParams);
+ }
+
bool isMCDCDecision() const {
return std::holds_alternative<mcdc::DecisionParameters>(MCDCParams);
}
@@ -290,10 +294,36 @@ public:
return SM.getLocForEndOfFile(SM.getFileID(Loc));
}
- /// Find out where the current file is included or macro is expanded.
- SourceLocation getIncludeOrExpansionLoc(SourceLocation Loc) {
- return Loc.isMacroID() ? SM.getImmediateExpansionRange(Loc).getBegin()
- : SM.getIncludeLoc(SM.getFileID(Loc));
+ /// Find out where a macro is expanded. If the immediate result is a
+ /// <scratch space>, keep looking until the result isn't. Return a pair of
+ /// \c SourceLocation. The first object is always the begin sloc of the found
+ /// result. The second should be checked by the caller: if it has a value, it's
+ /// the end sloc of the found result. Otherwise the while loop didn't get
+ /// executed, which means the location wasn't changed and the caller has to
+ /// learn the end sloc from somewhere else.
+ std::pair<SourceLocation, std::optional<SourceLocation>>
+ getNonScratchExpansionLoc(SourceLocation Loc) {
+ std::optional<SourceLocation> EndLoc = std::nullopt;
+ while (Loc.isMacroID() &&
+ SM.isWrittenInScratchSpace(SM.getSpellingLoc(Loc))) {
+ auto ExpansionRange = SM.getImmediateExpansionRange(Loc);
+ Loc = ExpansionRange.getBegin();
+ EndLoc = ExpansionRange.getEnd();
+ }
+ return std::make_pair(Loc, EndLoc);
+ }
+
+ /// Find out where the current file is included or macro is expanded. If
+ /// \c AcceptScratch is set to false, keep looking for expansions until the
+ /// found sloc is not a <scratch space>.
+ SourceLocation getIncludeOrExpansionLoc(SourceLocation Loc,
+ bool AcceptScratch = true) {
+ if (!Loc.isMacroID())
+ return SM.getIncludeLoc(SM.getFileID(Loc));
+ Loc = SM.getImmediateExpansionRange(Loc).getBegin();
+ if (AcceptScratch)
+ return Loc;
+ return getNonScratchExpansionLoc(Loc).first;
}
/// Return true if \c Loc is a location in a built-in macro.
@@ -340,6 +370,15 @@ public:
for (auto &Region : SourceRegions) {
SourceLocation Loc = Region.getBeginLoc();
+ // Replace Region with its definition if it is in <scratch space>.
+ auto NonScratchExpansionLoc = getNonScratchExpansionLoc(Loc);
+ auto EndLoc = NonScratchExpansionLoc.second;
+ if (EndLoc.has_value()) {
+ Loc = NonScratchExpansionLoc.first;
+ Region.setStartLoc(Loc);
+ Region.setEndLoc(EndLoc.value());
+ }
+
// Replace Loc with FileLoc if it is expanded with system headers.
if (!SystemHeadersCoverage && SM.isInSystemMacro(Loc)) {
auto BeginLoc = SM.getSpellingLoc(Loc);
@@ -472,13 +511,19 @@ public:
// Ignore regions from system headers unless collecting coverage from
// system headers is explicitly enabled.
if (!SystemHeadersCoverage &&
- SM.isInSystemHeader(SM.getSpellingLoc(LocStart)))
+ SM.isInSystemHeader(SM.getSpellingLoc(LocStart))) {
+ assert(!Region.isMCDCBranch() && !Region.isMCDCDecision() &&
+ "Don't suppress the condition in system headers");
continue;
+ }
auto CovFileID = getCoverageFileID(LocStart);
// Ignore regions that don't have a file, such as builtin macros.
- if (!CovFileID)
+ if (!CovFileID) {
+ assert(!Region.isMCDCBranch() && !Region.isMCDCDecision() &&
+ "Don't suppress the condition in non-file regions");
continue;
+ }
SourceLocation LocEnd = Region.getEndLoc();
assert(SM.isWrittenInSameFile(LocStart, LocEnd) &&
@@ -488,8 +533,11 @@ public:
// This not only suppresses redundant regions, but sometimes prevents
// creating regions with wrong counters if, for example, a statement's
// body ends at the end of a nested macro.
- if (Filter.count(std::make_pair(LocStart, LocEnd)))
+ if (Filter.count(std::make_pair(LocStart, LocEnd))) {
+ assert(!Region.isMCDCBranch() && !Region.isMCDCDecision() &&
+ "Don't suppress the condition");
continue;
+ }
// Find the spelling locations for the mapping region.
SpellingRegion SR{SM, LocStart, LocEnd};
@@ -525,7 +573,7 @@ public:
SourceRegionFilter Filter;
for (const auto &FM : FileIDMapping) {
SourceLocation ExpandedLoc = FM.second.second;
- SourceLocation ParentLoc = getIncludeOrExpansionLoc(ExpandedLoc);
+ SourceLocation ParentLoc = getIncludeOrExpansionLoc(ExpandedLoc, false);
if (ParentLoc.isInvalid())
continue;
@@ -2223,7 +2271,8 @@ struct CounterCoverageMappingBuilder
}
void VisitOpaqueValueExpr(const OpaqueValueExpr* OVE) {
- Visit(OVE->getSourceExpr());
+ if (OVE->isUnique())
+ Visit(OVE->getSourceExpr());
}
};
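A small sketch of how a <scratch space> location arises: token pasting inside a macro spells the pasted token in scratch space, which the mapper now walks past when attributing coverage regions (assumed shape, not a test from this patch):

#define PASTE(a, b) a##b
#define CALL(n) PASTE(do_, n)()
void do_thing();
void use() {
  CALL(thing);  // the pasted token 'do_thing' is spelled in <scratch space>
}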
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 18acf7784f71..8427286dee88 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -1793,6 +1793,37 @@ void ItaniumCXXABI::EmitDestructorCall(CodeGenFunction &CGF,
ThisTy, VTT, VTTTy, nullptr);
}
+// Check if any non-inline method has the specified attribute.
+template <typename T>
+static bool CXXRecordNonInlineHasAttr(const CXXRecordDecl *RD) {
+ for (const auto *D : RD->noload_decls()) {
+ if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
+ if (FD->isInlined() || FD->doesThisDeclarationHaveABody() ||
+ FD->isPureVirtual())
+ continue;
+ if (D->hasAttr<T>())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void setVTableSelectiveDLLImportExport(CodeGenModule &CGM,
+ llvm::GlobalVariable *VTable,
+ const CXXRecordDecl *RD) {
+ if (VTable->getDLLStorageClass() !=
+ llvm::GlobalVariable::DefaultStorageClass ||
+ RD->hasAttr<DLLImportAttr>() || RD->hasAttr<DLLExportAttr>())
+ return;
+
+ if (CGM.getVTables().isVTableExternal(RD)) {
+ if (CXXRecordNonInlineHasAttr<DLLImportAttr>(RD))
+ VTable->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
+ } else if (CXXRecordNonInlineHasAttr<DLLExportAttr>(RD))
+ VTable->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
+}
+
void ItaniumCXXABI::emitVTableDefinitions(CodeGenVTables &CGVT,
const CXXRecordDecl *RD) {
llvm::GlobalVariable *VTable = getAddrOfVTable(RD, CharUnits());
@@ -1818,6 +1849,9 @@ void ItaniumCXXABI::emitVTableDefinitions(CodeGenVTables &CGVT,
if (CGM.supportsCOMDAT() && VTable->isWeakForLinker())
VTable->setComdat(CGM.getModule().getOrInsertComdat(VTable->getName()));
+ if (CGM.getTarget().hasPS4DLLImportExport())
+ setVTableSelectiveDLLImportExport(CGM, VTable, RD);
+
// Set the right visibility.
CGM.setGVProperties(VTable, RD);
@@ -1905,29 +1939,6 @@ ItaniumCXXABI::getVTableAddressPoint(BaseSubobject Base,
VTable->getValueType(), VTable, Indices, /*InBounds=*/true, InRange);
}
-// Check whether all the non-inline virtual methods for the class have the
-// specified attribute.
-template <typename T>
-static bool CXXRecordAllNonInlineVirtualsHaveAttr(const CXXRecordDecl *RD) {
- bool FoundNonInlineVirtualMethodWithAttr = false;
- for (const auto *D : RD->noload_decls()) {
- if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
- if (!FD->isVirtualAsWritten() || FD->isInlineSpecified() ||
- FD->doesThisDeclarationHaveABody())
- continue;
- if (!D->hasAttr<T>())
- return false;
- FoundNonInlineVirtualMethodWithAttr = true;
- }
- }
-
- // We didn't find any non-inline virtual methods missing the attribute. We
- // will return true when we found at least one non-inline virtual with the
- // attribute. (This lets our caller know that the attribute needs to be
- // propagated up to the vtable.)
- return FoundNonInlineVirtualMethodWithAttr;
-}
-
llvm::Value *ItaniumCXXABI::getVTableAddressPointInStructorWithVTT(
CodeGenFunction &CGF, const CXXRecordDecl *VTableClass, BaseSubobject Base,
const CXXRecordDecl *NearestVBase) {
@@ -1981,26 +1992,10 @@ llvm::GlobalVariable *ItaniumCXXABI::getAddrOfVTable(const CXXRecordDecl *RD,
getContext().toCharUnitsFromBits(PAlign).getAsAlign());
VTable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
- // In MS C++ if you have a class with virtual functions in which you are using
- // selective member import/export, then all virtual functions must be exported
- // unless they are inline, otherwise a link error will result. To match this
- // behavior, for such classes, we dllimport the vtable if it is defined
- // externally and all the non-inline virtual methods are marked dllimport, and
- // we dllexport the vtable if it is defined in this TU and all the non-inline
- // virtual methods are marked dllexport.
- if (CGM.getTarget().hasPS4DLLImportExport()) {
- if ((!RD->hasAttr<DLLImportAttr>()) && (!RD->hasAttr<DLLExportAttr>())) {
- if (CGM.getVTables().isVTableExternal(RD)) {
- if (CXXRecordAllNonInlineVirtualsHaveAttr<DLLImportAttr>(RD))
- VTable->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
- } else {
- if (CXXRecordAllNonInlineVirtualsHaveAttr<DLLExportAttr>(RD))
- VTable->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
- }
- }
- }
- CGM.setGVProperties(VTable, RD);
+ if (CGM.getTarget().hasPS4DLLImportExport())
+ setVTableSelectiveDLLImportExport(CGM, VTable, RD);
+ CGM.setGVProperties(VTable, RD);
return VTable;
}
@@ -3285,7 +3280,7 @@ ItaniumRTTIBuilder::GetAddrOfExternalRTTIDescriptor(QualType Ty) {
// Import the typeinfo symbol when all non-inline virtual methods are
// imported.
if (CGM.getTarget().hasPS4DLLImportExport()) {
- if (RD && CXXRecordAllNonInlineVirtualsHaveAttr<DLLImportAttr>(RD)) {
+ if (RD && CXXRecordNonInlineHasAttr<DLLImportAttr>(RD)) {
GV->setDLLStorageClass(llvm::GlobalVariable::DLLImportStorageClass);
CGM.setDSOLocal(GV);
}
@@ -3938,13 +3933,13 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
// Export the typeinfo in the same circumstances as the vtable is exported.
auto GVDLLStorageClass = DLLStorageClass;
- if (CGM.getTarget().hasPS4DLLImportExport()) {
+ if (CGM.getTarget().hasPS4DLLImportExport() &&
+ GVDLLStorageClass != llvm::GlobalVariable::DLLExportStorageClass) {
if (const RecordType *RecordTy = dyn_cast<RecordType>(Ty)) {
const CXXRecordDecl *RD = cast<CXXRecordDecl>(RecordTy->getDecl());
if (RD->hasAttr<DLLExportAttr>() ||
- CXXRecordAllNonInlineVirtualsHaveAttr<DLLExportAttr>(RD)) {
+ CXXRecordNonInlineHasAttr<DLLExportAttr>(RD))
GVDLLStorageClass = llvm::GlobalVariable::DLLExportStorageClass;
- }
}
}
@@ -3984,9 +3979,7 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(
CGM.setDSOLocal(GV);
TypeName->setDLLStorageClass(DLLStorageClass);
- GV->setDLLStorageClass(CGM.getTarget().hasPS4DLLImportExport()
- ? GVDLLStorageClass
- : DLLStorageClass);
+ GV->setDLLStorageClass(GVDLLStorageClass);
TypeName->setPartition(CGM.getCodeGenOpts().SymbolPartition);
GV->setPartition(CGM.getCodeGenOpts().SymbolPartition);
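A minimal sketch of the semantic shift in the vtable hunks above, assuming a PS4-family target where hasPS4DLLImportExport() is true and -fdeclspec is in effect; the class is invented for illustration:

    struct Gadget {
      __declspec(dllexport) void update();   // non-inline, carries the attribute
      virtual void draw();                   // non-inline virtual, no attribute
      virtual ~Gadget();
    };
    // Old predicate (CXXRecordAllNonInlineVirtualsHaveAttr): false, since not every
    // non-inline virtual is dllexport, so the vtable kept default storage.
    // New predicate (CXXRecordNonInlineHasAttr): true, since at least one non-inline
    // method is dllexport, so setVTableSelectiveDLLImportExport() gives the vtable
    // (and, in BuildTypeInfo, the typeinfo) DLLExport storage when it is emitted in
    // this TU.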
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 2868b4f2b02e..f5ea73a04ae5 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -2653,22 +2653,13 @@ void Driver::BuildInputs(const ToolChain &TC, DerivedArgList &Args,
Diag(clang::diag::note_drv_t_option_is_global);
}
- // CUDA/HIP and their preprocessor expansions can be accepted by CL mode.
// Warn -x after last input file has no effect
- auto LastXArg = Args.getLastArgValue(options::OPT_x);
- const llvm::StringSet<> ValidXArgs = {"cuda", "hip", "cui", "hipi"};
- if (!IsCLMode() || ValidXArgs.contains(LastXArg)) {
+ {
Arg *LastXArg = Args.getLastArgNoClaim(options::OPT_x);
Arg *LastInputArg = Args.getLastArgNoClaim(options::OPT_INPUT);
if (LastXArg && LastInputArg &&
LastInputArg->getIndex() < LastXArg->getIndex())
Diag(clang::diag::warn_drv_unused_x) << LastXArg->getValue();
- } else {
- // In CL mode suggest /TC or /TP since -x doesn't make sense if passed via
- // /clang:.
- if (auto *A = Args.getLastArg(options::OPT_x))
- Diag(diag::err_drv_unsupported_opt_with_suggestion)
- << A->getAsString(Args) << "/TC' or '/TP";
}
for (Arg *A : Args) {
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index 85825e1ea65b..381d72e045b9 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -479,14 +479,6 @@ static void addTocDataOptions(const llvm::opt::ArgList &Args,
return false;
}();
- // Currently only supported for small code model.
- if (TOCDataGloballyinEffect &&
- (Args.getLastArgValue(options::OPT_mcmodel_EQ) == "large" ||
- Args.getLastArgValue(options::OPT_mcmodel_EQ) == "medium")) {
- D.Diag(clang::diag::warn_drv_unsupported_tocdata);
- return;
- }
-
enum TOCDataSetting {
AddressInTOC = 0, // Address of the symbol stored in the TOC.
DataInTOC = 1 // Symbol defined in the TOC.
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index d23f9b36efb9..9ea4cc3f7cb9 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -181,7 +181,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
// -m*-float and -mfpu=none/0/32 conflict with -mlsx.
if (A->getOption().matches(options::OPT_mlsx)) {
if (llvm::find(Features, "-d") != Features.end())
- D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lsx);
+ D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LSX*/ 0;
else /*-mlsx*/
Features.push_back("+lsx");
} else /*-mno-lsx*/ {
@@ -196,7 +196,7 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
// -mno-lsx conflicts with -mlasx.
if (A->getOption().matches(options::OPT_mlasx)) {
if (llvm::find(Features, "-d") != Features.end())
- D.Diag(diag::err_drv_loongarch_wrong_fpu_width_for_lasx);
+ D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*LASX*/ 1;
else if (llvm::find(Features, "-lsx") != Features.end())
D.Diag(diag::err_drv_loongarch_invalid_simd_option_combination);
else { /*-mlasx*/
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 6d2015b2cd15..97e451cfe2ac 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1030,7 +1030,7 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
// If user provided -o, that is the dependency target, except
// when we are only generating a dependency file.
- Arg *OutputOpt = Args.getLastArg(options::OPT_o);
+ Arg *OutputOpt = Args.getLastArg(options::OPT_o, options::OPT__SLASH_Fo);
if (OutputOpt && Output.getType() != types::TY_Dependencies) {
DepTarget = OutputOpt->getValue();
} else {
@@ -5681,11 +5681,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
// enabled. This alias option is being used to simplify the hasFlag logic.
OptSpecifier StrictAliasingAliasOption =
OFastEnabled ? options::OPT_Ofast : options::OPT_fstrict_aliasing;
- // We turn strict aliasing off by default if we're in CL mode, since MSVC
+  // We turn strict aliasing off by default if we're targeting Windows MSVC, since MSVC
// doesn't do any TBAA.
- bool TBAAOnByDefault = !D.IsCLMode();
if (!Args.hasFlag(options::OPT_fstrict_aliasing, StrictAliasingAliasOption,
- options::OPT_fno_strict_aliasing, TBAAOnByDefault))
+ options::OPT_fno_strict_aliasing, !IsWindowsMSVC))
CmdArgs.push_back("-relaxed-aliasing");
if (!Args.hasFlag(options::OPT_fstruct_path_tbaa,
options::OPT_fno_struct_path_tbaa, true))
@@ -7027,8 +7026,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fms_compatibility, options::OPT_fno_ms_compatibility,
(IsWindowsMSVC && Args.hasFlag(options::OPT_fms_extensions,
options::OPT_fno_ms_extensions, true)));
- if (IsMSVCCompat)
+ if (IsMSVCCompat) {
CmdArgs.push_back("-fms-compatibility");
+ if (!types::isCXX(Input.getType()) &&
+ Args.hasArg(options::OPT_fms_define_stdc))
+ CmdArgs.push_back("-fms-define-stdc");
+ }
if (Triple.isWindowsMSVCEnvironment() && !D.IsCLMode() &&
Args.hasArg(options::OPT_fms_runtime_lib_EQ))
@@ -7263,10 +7266,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
}
}
- // -fsized-deallocation is off by default, as it is an ABI-breaking change for
- // most platforms.
- Args.addOptInFlag(CmdArgs, options::OPT_fsized_deallocation,
- options::OPT_fno_sized_deallocation);
+ // -fsized-deallocation is on by default in C++14 onwards and otherwise off
+ // by default.
+ Args.addLastArg(CmdArgs, options::OPT_fsized_deallocation,
+ options::OPT_fno_sized_deallocation);
// -faligned-allocation is on by default in C++17 onwards and otherwise off
// by default.
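As a hedged illustration of what the new -fsized-deallocation default above means for user code; the type and function are invented:

    struct Widget { int data[16]; };
    void destroy(Widget *p) {
      // With sized deallocation implied in C++14 and later, this delete may lower
      // to ::operator delete(p, sizeof(Widget)) instead of the unsized form.
      delete p;
    }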
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index caf6c4a444fd..593b403a1e3f 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -2912,9 +2912,54 @@ static bool sdkSupportsBuiltinModules(const Darwin::DarwinPlatformKind &TargetPl
}
}
-void Darwin::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
- llvm::opt::ArgStringList &CC1Args,
- Action::OffloadKind DeviceOffloadKind) const {
+static inline llvm::VersionTuple
+sizedDeallocMinVersion(llvm::Triple::OSType OS) {
+ switch (OS) {
+ default:
+ break;
+ case llvm::Triple::Darwin:
+ case llvm::Triple::MacOSX: // Earliest supporting version is 10.12.
+ return llvm::VersionTuple(10U, 12U);
+ case llvm::Triple::IOS:
+ case llvm::Triple::TvOS: // Earliest supporting version is 10.0.0.
+ return llvm::VersionTuple(10U);
+ case llvm::Triple::WatchOS: // Earliest supporting version is 3.0.0.
+ return llvm::VersionTuple(3U);
+ }
+
+ llvm_unreachable("Unexpected OS");
+}
+
+bool Darwin::isSizedDeallocationUnavailable() const {
+ llvm::Triple::OSType OS;
+
+ if (isTargetMacCatalyst())
+ return TargetVersion < sizedDeallocMinVersion(llvm::Triple::MacOSX);
+ switch (TargetPlatform) {
+ case MacOS: // Earlier than 10.12.
+ OS = llvm::Triple::MacOSX;
+ break;
+ case IPhoneOS:
+ OS = llvm::Triple::IOS;
+ break;
+ case TvOS: // Earlier than 10.0.
+ OS = llvm::Triple::TvOS;
+ break;
+ case WatchOS: // Earlier than 3.0.
+ OS = llvm::Triple::WatchOS;
+ break;
+ case DriverKit:
+ case XROS:
+ // Always available.
+ return false;
+ }
+
+ return TargetVersion < sizedDeallocMinVersion(OS);
+}
+
+void Darwin::addClangTargetOptions(
+ const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
+ Action::OffloadKind DeviceOffloadKind) const {
// Pass "-faligned-alloc-unavailable" only when the user hasn't manually
// enabled or disabled aligned allocations.
if (!DriverArgs.hasArgNoClaim(options::OPT_faligned_allocation,
@@ -2922,6 +2967,13 @@ void Darwin::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
isAlignedAllocationUnavailable())
CC1Args.push_back("-faligned-alloc-unavailable");
+ // Pass "-fno-sized-deallocation" only when the user hasn't manually enabled
+ // or disabled sized deallocations.
+ if (!DriverArgs.hasArgNoClaim(options::OPT_fsized_deallocation,
+ options::OPT_fno_sized_deallocation) &&
+ isSizedDeallocationUnavailable())
+ CC1Args.push_back("-fno-sized-deallocation");
+
addClangCC1ASTargetOptions(DriverArgs, CC1Args);
// Enable compatibility mode for NSItemProviderCompletionHandler in
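A sketch of how the new gating above plays out for one deployment target; the concrete version is an assumed example, not taken from the patch:

    llvm::VersionTuple Target(10, 11);                        // e.g. -mmacosx-version-min=10.11
    bool Unavailable = Target < llvm::VersionTuple(10, 12);   // true, so the driver adds
                                                              // -fno-sized-deallocation unless the
                                                              // user passed the flag explicitly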
diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h
index 10d4b69e5d5f..b45279ecedeb 100644
--- a/clang/lib/Driver/ToolChains/Darwin.h
+++ b/clang/lib/Driver/ToolChains/Darwin.h
@@ -511,6 +511,10 @@ protected:
/// targeting.
bool isAlignedAllocationUnavailable() const;
+  /// Return true if C++14 sized deallocation functions are not implemented in
+  /// the C++ standard library of the deployment target we are targeting.
+ bool isSizedDeallocationUnavailable() const;
+
void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
llvm::opt::ArgStringList &CC1Args,
Action::OffloadKind DeviceOffloadKind) const override;
diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
index a144b28057f4..bdbcf9109129 100644
--- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
+++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
@@ -193,7 +193,7 @@ void HIPSPVToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs,
StringRef hipPath = DriverArgs.getLastArgValue(options::OPT_hip_path_EQ);
if (hipPath.empty()) {
- getDriver().Diag(diag::err_drv_hipspv_no_hip_path) << 1 << "'-nogpuinc'";
+ getDriver().Diag(diag::err_drv_hipspv_no_hip_path);
return;
}
SmallString<128> P(hipPath);
diff --git a/clang/lib/Driver/ToolChains/ZOS.cpp b/clang/lib/Driver/ToolChains/ZOS.cpp
index d5fc7b8ef562..074e0556ecd2 100644
--- a/clang/lib/Driver/ToolChains/ZOS.cpp
+++ b/clang/lib/Driver/ToolChains/ZOS.cpp
@@ -36,6 +36,12 @@ void ZOS::addClangTargetOptions(const ArgList &DriverArgs,
if (!DriverArgs.hasArgNoClaim(options::OPT_faligned_allocation,
options::OPT_fno_aligned_allocation))
CC1Args.push_back("-faligned-alloc-unavailable");
+
+ // Pass "-fno-sized-deallocation" only when the user hasn't manually enabled
+ // or disabled sized deallocations.
+ if (!DriverArgs.hasArgNoClaim(options::OPT_fsized_deallocation,
+ options::OPT_fno_sized_deallocation))
+ CC1Args.push_back("-fno-sized-deallocation");
}
void zos::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp
index 98b9343924a8..8c7c0f8a1472 100644
--- a/clang/lib/ExtractAPI/DeclarationFragments.cpp
+++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp
@@ -999,11 +999,11 @@ DeclarationFragmentsBuilder::getFragmentsForTemplateParameters(
DeclarationFragments::FragmentKind::GenericParameter);
if (TemplateParam->hasDefaultArgument()) {
- DeclarationFragments After;
+ const auto Default = TemplateParam->getDefaultArgument();
Fragments.append(" = ", DeclarationFragments::FragmentKind::Text)
- .append(getFragmentsForType(TemplateParam->getDefaultArgument(),
- TemplateParam->getASTContext(), After));
- Fragments.append(std::move(After));
+ .append(getFragmentsForTemplateArguments(
+ {Default.getArgument()}, TemplateParam->getASTContext(),
+ {Default}));
}
} else if (const auto *NTP =
dyn_cast<NonTypeTemplateParmDecl>(ParameterArray[i])) {
@@ -1023,8 +1023,9 @@ DeclarationFragmentsBuilder::getFragmentsForTemplateParameters(
if (NTP->hasDefaultArgument()) {
SmallString<8> ExprStr;
raw_svector_ostream Output(ExprStr);
- NTP->getDefaultArgument()->printPretty(
- Output, nullptr, NTP->getASTContext().getPrintingPolicy());
+ NTP->getDefaultArgument().getArgument().print(
+ NTP->getASTContext().getPrintingPolicy(), Output,
+ /*IncludeType=*/false);
Fragments.append(" = ", DeclarationFragments::FragmentKind::Text)
.append(ExprStr, DeclarationFragments::FragmentKind::Text);
}
@@ -1083,12 +1084,22 @@ DeclarationFragmentsBuilder::getFragmentsForTemplateArguments(
if (StringRef(ArgumentFragment.begin()->Spelling)
.starts_with("type-parameter")) {
- std::string ProperArgName = TemplateArgumentLocs.value()[i]
- .getTypeSourceInfo()
- ->getType()
- .getAsString();
- ArgumentFragment.begin()->Spelling.swap(ProperArgName);
+ if (TemplateArgumentLocs.has_value() &&
+ TemplateArgumentLocs->size() > i) {
+ std::string ProperArgName = TemplateArgumentLocs.value()[i]
+ .getTypeSourceInfo()
+ ->getType()
+ .getAsString();
+ ArgumentFragment.begin()->Spelling.swap(ProperArgName);
+ } else {
+ auto &Spelling = ArgumentFragment.begin()->Spelling;
+ Spelling.clear();
+ raw_string_ostream OutStream(Spelling);
+ CTA.print(Context.getPrintingPolicy(), OutStream, false);
+ OutStream.flush();
+ }
}
+
Fragments.append(std::move(ArgumentFragment));
break;
}
@@ -1211,9 +1222,9 @@ DeclarationFragmentsBuilder::getFragmentsForClassTemplateSpecialization(
cast<CXXRecordDecl>(Decl)))
.pop_back() // there is an extra semicolon now
.append("<", DeclarationFragments::FragmentKind::Text)
- .append(
- getFragmentsForTemplateArguments(Decl->getTemplateArgs().asArray(),
- Decl->getASTContext(), std::nullopt))
+ .append(getFragmentsForTemplateArguments(
+ Decl->getTemplateArgs().asArray(), Decl->getASTContext(),
+ Decl->getTemplateArgsAsWritten()->arguments()))
.append(">", DeclarationFragments::FragmentKind::Text)
.appendSemicolon();
}
@@ -1254,9 +1265,9 @@ DeclarationFragmentsBuilder::getFragmentsForVarTemplateSpecialization(
.append(DeclarationFragmentsBuilder::getFragmentsForVarTemplate(Decl))
.pop_back() // there is an extra semicolon now
.append("<", DeclarationFragments::FragmentKind::Text)
- .append(
- getFragmentsForTemplateArguments(Decl->getTemplateArgs().asArray(),
- Decl->getASTContext(), std::nullopt))
+ .append(getFragmentsForTemplateArguments(
+ Decl->getTemplateArgs().asArray(), Decl->getASTContext(),
+ Decl->getTemplateArgsAsWritten()->arguments()))
.append(">", DeclarationFragments::FragmentKind::Text)
.appendSemicolon();
}
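A hedged example of the declarations the symbol-graph hunks above now render through getFragmentsForTemplateArguments; the names are invented:

    template <typename T = int> struct Box { T value; };   // defaulted type parameter
    template <int N = 4>        struct Pool {};            // defaulted non-type parameter,
                                                            // printed without its type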
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 3dd10f6bd2b3..b6f7567adc14 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -1410,6 +1410,13 @@ void UnwrappedLineParser::readTokenWithJavaScriptASI() {
}
}
+static bool isAltOperator(const FormatToken &Tok) {
+ return isalpha(Tok.TokenText[0]) &&
+ Tok.isOneOf(tok::ampamp, tok::ampequal, tok::amp, tok::pipe,
+ tok::tilde, tok::exclaim, tok::exclaimequal, tok::pipepipe,
+ tok::pipeequal, tok::caret, tok::caretequal);
+}
+
void UnwrappedLineParser::parseStructuralElement(
const FormatToken *OpeningBrace, IfStmtKind *IfKind,
FormatToken **IfLeftBrace, bool *HasDoWhile, bool *HasLabel) {
@@ -1689,9 +1696,15 @@ void UnwrappedLineParser::parseStructuralElement(
break;
}
- const bool InRequiresExpression =
- OpeningBrace && OpeningBrace->is(TT_RequiresExpressionLBrace);
- do {
+ for (const bool InRequiresExpression =
+ OpeningBrace && OpeningBrace->is(TT_RequiresExpressionLBrace);
+ !eof();) {
+ if (IsCpp && isAltOperator(*FormatTok)) {
+ if (auto *Next = Tokens->peekNextToken(/*SkipComment=*/true);
+ Next && Next->isBinaryOperator()) {
+ FormatTok->Tok.setKind(tok::identifier);
+ }
+ }
const FormatToken *Previous = FormatTok->Previous;
switch (FormatTok->Tok.getKind()) {
case tok::at:
@@ -2122,7 +2135,7 @@ void UnwrappedLineParser::parseStructuralElement(
nextToken();
break;
}
- } while (!eof());
+ }
}
bool UnwrappedLineParser::tryToParsePropertyAccessor() {
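To make the new isAltOperator() check above concrete, a small illustration; how clang-format ultimately lays these out is not asserted here:

    // Both spellings lex to tok::ampamp, but only the first starts with an
    // alphabetic character, so only it satisfies isAltOperator():
    //   a and b
    //   a && b
    // In parseStructuralElement(), such a token followed by a binary operator is
    // re-kinded to tok::identifier so it is treated as a name rather than an operator.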
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 68760e00003e..e8c8a5175f8f 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -432,7 +432,8 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
// [C++] Whether __STDC__ is predefined and if so, what its value is,
// are implementation-defined.
// (Removed in C++20.)
- if (!LangOpts.MSVCCompat && !LangOpts.TraditionalCPP)
+ if ((!LangOpts.MSVCCompat || LangOpts.MSVCEnableStdcMacro) &&
+ !LangOpts.TraditionalCPP)
Builder.defineMacro("__STDC__");
// -- __STDC_HOSTED__
// The integer literal 1 if the implementation is a hosted
diff --git a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp
index b76728acb907..0887b5a504f0 100644
--- a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp
+++ b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp
@@ -574,7 +574,7 @@ void SDiagsWriter::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel,
SmallString<256> diagnostic;
Info.FormatDiagnostic(diagnostic);
getMetaDiags()->Report(
- diag::warn_fe_serialized_diag_failure_during_finalisation)
+ diag::warn_fe_serialized_diag_failure_during_finalization)
<< diagnostic;
return;
}
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index 5f02c71f6ca5..dbff92b4e59b 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -153,12 +153,10 @@ set(x86_files
avx512bwintrin.h
avx512cdintrin.h
avx512dqintrin.h
- avx512erintrin.h
avx512fintrin.h
avx512fp16intrin.h
avx512ifmaintrin.h
avx512ifmavlintrin.h
- avx512pfintrin.h
avx512vbmi2intrin.h
avx512vbmiintrin.h
avx512vbmivlintrin.h
diff --git a/clang/lib/Headers/avx512erintrin.h b/clang/lib/Headers/avx512erintrin.h
deleted file mode 100644
index 1c5a2d2d208f..000000000000
--- a/clang/lib/Headers/avx512erintrin.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512ERINTRIN_H
-#define __AVX512ERINTRIN_H
-
-/* exp2a23 */
-#define _mm512_exp2a23_round_pd(A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_exp2a23_pd(A) \
- _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_pd(S, M, A) \
- _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_pd(M, A) \
- _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_exp2a23_round_ps(A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_exp2a23_ps(A) \
- _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_ps(S, M, A) \
- _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_ps(M, A) \
- _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-/* rsqrt28 */
-#define _mm512_rsqrt28_round_pd(A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_rsqrt28_pd(A) \
- _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_pd(S, M, A) \
- _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_pd(M, A) \
- _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rsqrt28_round_ps(A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_rsqrt28_ps(A) \
- _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_ps(S, M, A) \
- _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_ps(M, A) \
- _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_ss(A, B) \
- _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_ss(S, M, A, B) \
- _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_ss(M, A, B) \
- _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_sd(A, B) \
- _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_sd(S, M, A, B) \
- _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_sd(M, A, B) \
- _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-/* rcp28 */
-#define _mm512_rcp28_round_pd(A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rcp28_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_rcp28_pd(A) \
- _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_pd(S, M, A) \
- _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_pd(M, A) \
- _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rcp28_round_ps(A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rcp28_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_rcp28_ps(A) \
- _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_ps(S, M, A) \
- _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_ps(M, A) \
- _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_ss(A, B) \
- _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_ss(S, M, A, B) \
- _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_ss(M, A, B) \
- _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_sd(A, B) \
- _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_sd(S, M, A, B) \
- _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_sd(M, A, B) \
- _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#endif /* __AVX512ERINTRIN_H */
diff --git a/clang/lib/Headers/avx512pfintrin.h b/clang/lib/Headers/avx512pfintrin.h
deleted file mode 100644
index f853be021a2d..000000000000
--- a/clang/lib/Headers/avx512pfintrin.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512PFINTRIN_H
-#define __AVX512PFINTRIN_H
-
-#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
- __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfdps((__mmask16)(mask), \
- (__v16si)(__m512i)(index), (void const *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
- __builtin_ia32_gatherpfdps((__mmask16) -1, \
- (__v16si)(__m512i)(index), (void const *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
- __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
- __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
- __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
- __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfdps((__mmask16)(mask), \
- (__v16si)(__m512i)(index), (void *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
- __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
- __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#endif
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 508696d3725b..cd6cf09b90ca 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -151,10 +151,6 @@
#include <avx512vldqintrin.h>
#endif
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512ER__)
-#include <avx512erintrin.h>
-#endif
-
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512IFMA__)
#include <avx512ifmaintrin.h>
#endif
@@ -186,10 +182,6 @@
#include <avx512vlvbmi2intrin.h>
#endif
-#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512PF__)
-#include <avx512pfintrin.h>
-#endif
-
#if !defined(__SCE__) || __has_feature(modules) || defined(__AVX512FP16__)
#include <avx512fp16intrin.h>
#endif
diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h
index 7eb6dceaabfa..5ceb986a1f65 100644
--- a/clang/lib/Headers/intrin.h
+++ b/clang/lib/Headers/intrin.h
@@ -378,7 +378,7 @@ unsigned int _CountLeadingSigns64(__int64);
unsigned int _CountOneBits(unsigned long);
unsigned int _CountOneBits64(unsigned __int64);
-void __cdecl __prefetch(void *);
+void __cdecl __prefetch(const void *);
#endif
/*----------------------------------------------------------------------------*\
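A hedged usage sketch for the prototype change above; get_buffer() is an invented helper:

    const char *p = get_buffer();
    __prefetch(p);   // accepted without a cast now that the parameter is const void *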
diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
index 4abfd1d98a63..9ffc249c8d1a 100644
--- a/clang/lib/Headers/module.modulemap
+++ b/clang/lib/Headers/module.modulemap
@@ -44,7 +44,6 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "avxintrin.h"
textual header "avx2intrin.h"
textual header "avx512fintrin.h"
- textual header "avx512erintrin.h"
textual header "fmaintrin.h"
header "x86intrin.h"
diff --git a/clang/lib/Index/IndexDecl.cpp b/clang/lib/Index/IndexDecl.cpp
index 8eb88f5a1e94..a7fa6c5e6898 100644
--- a/clang/lib/Index/IndexDecl.cpp
+++ b/clang/lib/Index/IndexDecl.cpp
@@ -703,14 +703,16 @@ public:
IndexCtx.handleDecl(TP);
if (const auto *TTP = dyn_cast<TemplateTypeParmDecl>(TP)) {
if (TTP->hasDefaultArgument())
- IndexCtx.indexTypeSourceInfo(TTP->getDefaultArgumentInfo(), Parent);
+ handleTemplateArgumentLoc(TTP->getDefaultArgument(), Parent,
+ TP->getLexicalDeclContext());
if (auto *C = TTP->getTypeConstraint())
IndexCtx.handleReference(C->getNamedConcept(), C->getConceptNameLoc(),
Parent, TTP->getLexicalDeclContext());
} else if (const auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(TP)) {
IndexCtx.indexTypeSourceInfo(NTTP->getTypeSourceInfo(), Parent);
if (NTTP->hasDefaultArgument())
- IndexCtx.indexBody(NTTP->getDefaultArgument(), Parent);
+ handleTemplateArgumentLoc(NTTP->getDefaultArgument(), Parent,
+ TP->getLexicalDeclContext());
} else if (const auto *TTPD = dyn_cast<TemplateTemplateParmDecl>(TP)) {
if (TTPD->hasDefaultArgument())
handleTemplateArgumentLoc(TTPD->getDefaultArgument(), Parent,
diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp
index 445d3fd66e38..86e8a6b7ee0e 100644
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@@ -666,6 +666,9 @@ void Parser::ParseGNUAttributeArgs(
ParseBoundsAttribute(*AttrName, AttrNameLoc, Attrs, ScopeName, ScopeLoc,
Form);
return;
+ } else if (AttrKind == ParsedAttr::AT_CXXAssume) {
+ ParseCXXAssumeAttributeArg(Attrs, AttrName, AttrNameLoc, EndLoc, Form);
+ return;
}
// These may refer to the function arguments, but need to be parsed early to
@@ -720,6 +723,10 @@ unsigned Parser::ParseClangAttributeArgs(
ParseTypeTagForDatatypeAttribute(*AttrName, AttrNameLoc, Attrs, EndLoc,
ScopeName, ScopeLoc, Form);
break;
+
+ case ParsedAttr::AT_CXXAssume:
+ ParseCXXAssumeAttributeArg(Attrs, AttrName, AttrNameLoc, EndLoc, Form);
+ break;
}
return !Attrs.empty() ? Attrs.begin()->getNumArgs() : 0;
}
@@ -1923,9 +1930,8 @@ void Parser::DiagnoseCXX11AttributeExtension(ParsedAttributes &Attrs) {
// variable.
// This function moves attributes that should apply to the type off DS to Attrs.
void Parser::stripTypeAttributesOffDeclSpec(ParsedAttributes &Attrs,
- DeclSpec &DS,
- Sema::TagUseKind TUK) {
- if (TUK == Sema::TUK_Reference)
+ DeclSpec &DS, TagUseKind TUK) {
+ if (TUK == TagUseKind::Reference)
return;
llvm::SmallVector<ParsedAttr *, 1> ToBeMoved;
@@ -3306,6 +3312,19 @@ void Parser::ParseAlignmentSpecifier(ParsedAttributes &Attrs,
}
}
+void Parser::DistributeCLateParsedAttrs(Decl *Dcl,
+ LateParsedAttrList *LateAttrs) {
+ if (!LateAttrs)
+ return;
+
+ if (Dcl) {
+ for (auto *LateAttr : *LateAttrs) {
+ if (LateAttr->Decls.empty())
+ LateAttr->addDecl(Dcl);
+ }
+ }
+}
+
/// Bounds attributes (e.g., counted_by):
/// AttrName '(' expression ')'
void Parser::ParseBoundsAttribute(IdentifierInfo &AttrName,
@@ -4843,13 +4862,14 @@ static void DiagnoseCountAttributedTypeInUnnamedAnon(ParsingDeclSpec &DS,
///
void Parser::ParseStructDeclaration(
ParsingDeclSpec &DS,
- llvm::function_ref<void(ParsingFieldDeclarator &)> FieldsCallback) {
+ llvm::function_ref<Decl *(ParsingFieldDeclarator &)> FieldsCallback,
+ LateParsedAttrList *LateFieldAttrs) {
if (Tok.is(tok::kw___extension__)) {
// __extension__ silences extension warnings in the subexpression.
ExtensionRAIIObject O(Diags); // Use RAII to do this.
ConsumeToken();
- return ParseStructDeclaration(DS, FieldsCallback);
+ return ParseStructDeclaration(DS, FieldsCallback, LateFieldAttrs);
}
// Parse leading attributes.
@@ -4914,10 +4934,12 @@ void Parser::ParseStructDeclaration(
}
// If attributes exist after the declarator, parse them.
- MaybeParseGNUAttributes(DeclaratorInfo.D);
+ MaybeParseGNUAttributes(DeclaratorInfo.D, LateFieldAttrs);
// We're done with this declarator; invoke the callback.
- FieldsCallback(DeclaratorInfo);
+ Decl *Field = FieldsCallback(DeclaratorInfo);
+ if (Field)
+ DistributeCLateParsedAttrs(Field, LateFieldAttrs);
// If we don't have a comma, it is either the end of the list (a ';')
// or an error, bail out.
@@ -4928,6 +4950,73 @@ void Parser::ParseStructDeclaration(
}
}
+// TODO: All callers of this function should be moved to
+// `Parser::ParseLexedAttributeList`.
+void Parser::ParseLexedCAttributeList(LateParsedAttrList &LAs, bool EnterScope,
+ ParsedAttributes *OutAttrs) {
+ assert(LAs.parseSoon() &&
+ "Attribute list should be marked for immediate parsing.");
+ for (auto *LA : LAs) {
+ ParseLexedCAttribute(*LA, EnterScope, OutAttrs);
+ delete LA;
+ }
+ LAs.clear();
+}
+
+/// Finish parsing an attribute for which parsing was delayed.
+/// This will be called at the end of parsing a class declaration
+/// for each LateParsedAttribute. We consume the saved tokens and
+/// create an attribute with the arguments filled in. We add this
+/// to the Attribute list for the decl.
+void Parser::ParseLexedCAttribute(LateParsedAttribute &LA, bool EnterScope,
+ ParsedAttributes *OutAttrs) {
+ // Create a fake EOF so that attribute parsing won't go off the end of the
+ // attribute.
+ Token AttrEnd;
+ AttrEnd.startToken();
+ AttrEnd.setKind(tok::eof);
+ AttrEnd.setLocation(Tok.getLocation());
+ AttrEnd.setEofData(LA.Toks.data());
+ LA.Toks.push_back(AttrEnd);
+
+ // Append the current token at the end of the new token stream so that it
+ // doesn't get lost.
+ LA.Toks.push_back(Tok);
+ PP.EnterTokenStream(LA.Toks, /*DisableMacroExpansion=*/true,
+ /*IsReinject=*/true);
+ // Drop the current token and bring the first cached one. It's the same token
+ // as when we entered this function.
+ ConsumeAnyToken(/*ConsumeCodeCompletionTok=*/true);
+
+ // TODO: Use `EnterScope`
+ (void)EnterScope;
+
+ ParsedAttributes Attrs(AttrFactory);
+
+ assert(LA.Decls.size() <= 1 &&
+ "late field attribute expects to have at most one declaration.");
+
+ // Dispatch based on the attribute and parse it
+ ParseGNUAttributeArgs(&LA.AttrName, LA.AttrNameLoc, Attrs, nullptr, nullptr,
+ SourceLocation(), ParsedAttr::Form::GNU(), nullptr);
+
+ for (auto *D : LA.Decls)
+ Actions.ActOnFinishDelayedAttribute(getCurScope(), D, Attrs);
+
+ // Due to a parsing error, we either went over the cached tokens or
+ // there are still cached tokens left, so we skip the leftover tokens.
+ while (Tok.isNot(tok::eof))
+ ConsumeAnyToken();
+
+ // Consume the fake EOF token if it's there
+ if (Tok.is(tok::eof) && Tok.getEofData() == AttrEnd.getEofData())
+ ConsumeAnyToken();
+
+ if (OutAttrs) {
+ OutAttrs->takeAllFrom(Attrs);
+ }
+}
+
/// ParseStructUnionBody
/// struct-contents:
/// struct-declaration-list
@@ -4951,6 +5040,11 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
ParseScope StructScope(this, Scope::ClassScope|Scope::DeclScope);
Actions.ActOnTagStartDefinition(getCurScope(), TagDecl);
+ // `LateAttrParseExperimentalExtOnly=true` requests that only attributes
+ // marked with `LateAttrParseExperimentalExt` are late parsed.
+ LateParsedAttrList LateFieldAttrs(/*PSoon=*/true,
+ /*LateAttrParseExperimentalExtOnly=*/true);
+
// While we still have something to read, read the declarations in the struct.
while (!tryParseMisplacedModuleImport() && Tok.isNot(tok::r_brace) &&
Tok.isNot(tok::eof)) {
@@ -5001,18 +5095,19 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
}
if (!Tok.is(tok::at)) {
- auto CFieldCallback = [&](ParsingFieldDeclarator &FD) {
+ auto CFieldCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
// Install the declarator into the current TagDecl.
Decl *Field =
Actions.ActOnField(getCurScope(), TagDecl,
FD.D.getDeclSpec().getSourceRange().getBegin(),
FD.D, FD.BitfieldSize);
FD.complete(Field);
+ return Field;
};
// Parse all the comma separated declarators.
ParsingDeclSpec DS(*this);
- ParseStructDeclaration(DS, CFieldCallback);
+ ParseStructDeclaration(DS, CFieldCallback, &LateFieldAttrs);
} else { // Handle @defs
ConsumeToken();
if (!Tok.isObjCAtKeyword(tok::objc_defs)) {
@@ -5053,7 +5148,10 @@ void Parser::ParseStructUnionBody(SourceLocation RecordLoc,
ParsedAttributes attrs(AttrFactory);
// If attributes exist after struct contents, parse them.
- MaybeParseGNUAttributes(attrs);
+ MaybeParseGNUAttributes(attrs, &LateFieldAttrs);
+
+ // Late parse field attributes if necessary.
+ ParseLexedCAttributeList(LateFieldAttrs, /*EnterScope=*/false);
SmallVector<Decl *, 32> FieldDecls(TagDecl->fields());
@@ -5287,9 +5385,9 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
// enum foo {..}; void bar() { enum foo; } <- new foo in bar.
// enum foo {..}; void bar() { enum foo x; } <- use of old foo.
//
- Sema::TagUseKind TUK;
+ TagUseKind TUK;
if (AllowEnumSpecifier == AllowDefiningTypeSpec::No)
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
else if (Tok.is(tok::l_brace)) {
if (DS.isFriendSpecified()) {
Diag(Tok.getLocation(), diag::err_friend_decl_defines_type)
@@ -5301,9 +5399,9 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
ScopedEnumKWLoc = SourceLocation();
IsScopedUsingClassTag = false;
BaseType = TypeResult();
- TUK = Sema::TUK_Friend;
+ TUK = TagUseKind::Friend;
} else {
- TUK = Sema::TUK_Definition;
+ TUK = TagUseKind::Definition;
}
} else if (!isTypeSpecifier(DSC) &&
(Tok.is(tok::semi) ||
@@ -5312,7 +5410,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
// An opaque-enum-declaration is required to be standalone (no preceding or
// following tokens in the declaration). Sema enforces this separately by
// diagnosing anything else in the DeclSpec.
- TUK = DS.isFriendSpecified() ? Sema::TUK_Friend : Sema::TUK_Declaration;
+ TUK = DS.isFriendSpecified() ? TagUseKind::Friend : TagUseKind::Declaration;
if (Tok.isNot(tok::semi)) {
// A semicolon was missing after this declaration. Diagnose and recover.
ExpectAndConsume(tok::semi, diag::err_expected_after, "enum");
@@ -5320,21 +5418,21 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
Tok.setKind(tok::semi);
}
} else {
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
}
bool IsElaboratedTypeSpecifier =
- TUK == Sema::TUK_Reference || TUK == Sema::TUK_Friend;
+ TUK == TagUseKind::Reference || TUK == TagUseKind::Friend;
// If this is an elaborated type specifier nested in a larger declaration,
// and we delayed diagnostics before, just merge them into the current pool.
- if (TUK == Sema::TUK_Reference && shouldDelayDiagsInTag) {
+ if (TUK == TagUseKind::Reference && shouldDelayDiagsInTag) {
diagsFromTag.redelay();
}
MultiTemplateParamsArg TParams;
if (TemplateInfo.Kind != ParsedTemplateInfo::NonTemplate &&
- TUK != Sema::TUK_Reference) {
+ TUK != TagUseKind::Reference) {
if (!getLangOpts().CPlusPlus11 || !SS.isSet()) {
// Skip the rest of this declarator, up until the comma or semicolon.
Diag(Tok, diag::err_enum_template);
@@ -5355,7 +5453,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
SS.setTemplateParamLists(TParams);
}
- if (!Name && TUK != Sema::TUK_Definition) {
+ if (!Name && TUK != TagUseKind::Definition) {
Diag(Tok, diag::err_enumerator_unnamed_no_def);
DS.SetTypeSpecError();
@@ -5388,7 +5486,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
stripTypeAttributesOffDeclSpec(attrs, DS, TUK);
SkipBodyInfo SkipBody;
- if (!Name && TUK == Sema::TUK_Definition && Tok.is(tok::l_brace) &&
+ if (!Name && TUK == TagUseKind::Definition && Tok.is(tok::l_brace) &&
NextToken().is(tok::identifier))
SkipBody = Actions.shouldSkipAnonEnumBody(getCurScope(),
NextToken().getIdentifierInfo(),
@@ -5409,7 +5507,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
OffsetOfState, &SkipBody).get();
if (SkipBody.ShouldSkip) {
- assert(TUK == Sema::TUK_Definition && "can only skip a definition");
+ assert(TUK == TagUseKind::Definition && "can only skip a definition");
BalancedDelimiterTracker T(*this, tok::l_brace);
T.consumeOpen();
@@ -5451,7 +5549,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
if (!TagDecl) {
// The action failed to produce an enumeration tag. If this is a
// definition, consume the entire definition.
- if (Tok.is(tok::l_brace) && TUK != Sema::TUK_Reference) {
+ if (Tok.is(tok::l_brace) && TUK != TagUseKind::Reference) {
ConsumeBrace();
SkipUntil(tok::r_brace, StopAtSemi);
}
@@ -5460,7 +5558,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS,
return;
}
- if (Tok.is(tok::l_brace) && TUK == Sema::TUK_Definition) {
+ if (Tok.is(tok::l_brace) && TUK == TagUseKind::Definition) {
Decl *D = SkipBody.CheckSameAsPrevious ? SkipBody.New : TagDecl;
ParseEnumBody(StartLoc, D);
if (SkipBody.CheckSameAsPrevious &&
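The late-parsed C field attributes introduced above serve declarations like the following sketch; the struct is invented, and counted_by is the bounds attribute this machinery is wired up for:

    struct packet {
      unsigned len;
      int payload[] __attribute__((counted_by(len)));   /* argument names a sibling field */
    };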
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 5eaec2b621e6..9a4a777f575b 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -1961,11 +1961,11 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
MaybeParseCXX11Attributes(Attributes);
const PrintingPolicy &Policy = Actions.getASTContext().getPrintingPolicy();
- Sema::TagUseKind TUK;
+ TagUseKind TUK;
if (isDefiningTypeSpecifierContext(DSC, getLangOpts().CPlusPlus) ==
AllowDefiningTypeSpec::No ||
(getLangOpts().OpenMP && OpenMPDirectiveParsing))
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
else if (Tok.is(tok::l_brace) ||
(DSC != DeclSpecContext::DSC_association &&
getLangOpts().CPlusPlus && Tok.is(tok::colon)) ||
@@ -1980,10 +1980,10 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// Skip everything up to the semicolon, so that this looks like a proper
// friend class (or template thereof) declaration.
SkipUntil(tok::semi, StopBeforeMatch);
- TUK = Sema::TUK_Friend;
+ TUK = TagUseKind::Friend;
} else {
// Okay, this is a class definition.
- TUK = Sema::TUK_Definition;
+ TUK = TagUseKind::Definition;
}
} else if (isClassCompatibleKeyword() &&
(NextToken().is(tok::l_square) ||
@@ -2024,15 +2024,15 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
}
if (Tok.isOneOf(tok::l_brace, tok::colon))
- TUK = Sema::TUK_Definition;
+ TUK = TagUseKind::Definition;
else
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
PA.Revert();
} else if (!isTypeSpecifier(DSC) &&
(Tok.is(tok::semi) ||
(Tok.isAtStartOfLine() && !isValidAfterTypeSpecifier(false)))) {
- TUK = DS.isFriendSpecified() ? Sema::TUK_Friend : Sema::TUK_Declaration;
+ TUK = DS.isFriendSpecified() ? TagUseKind::Friend : TagUseKind::Declaration;
if (Tok.isNot(tok::semi)) {
const PrintingPolicy &PPol = Actions.getASTContext().getPrintingPolicy();
// A semicolon was missing after this declaration. Diagnose and recover.
@@ -2042,11 +2042,11 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
Tok.setKind(tok::semi);
}
} else
- TUK = Sema::TUK_Reference;
+ TUK = TagUseKind::Reference;
// Forbid misplaced attributes. In cases of a reference, we pass attributes
// to caller to handle.
- if (TUK != Sema::TUK_Reference) {
+ if (TUK != TagUseKind::Reference) {
// If this is not a reference, then the only possible
// valid place for C++11 attributes to appear here
// is between class-key and class-name. If there are
@@ -2072,7 +2072,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
if (!Name && !TemplateId &&
(DS.getTypeSpecType() == DeclSpec::TST_error ||
- TUK != Sema::TUK_Definition)) {
+ TUK != TagUseKind::Definition)) {
if (DS.getTypeSpecType() != DeclSpec::TST_error) {
// We have a declaration or reference to an anonymous class.
Diag(StartLoc, diag::err_anon_type_definition)
@@ -2082,7 +2082,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// If we are parsing a definition and stop at a base-clause, continue on
// until the semicolon. Continuing from the comma will just trick us into
// thinking we are seeing a variable declaration.
- if (TUK == Sema::TUK_Definition && Tok.is(tok::colon))
+ if (TUK == TagUseKind::Definition && Tok.is(tok::colon))
SkipUntil(tok::semi, StopBeforeMatch);
else
SkipUntil(tok::comma, StopAtSemi);
@@ -2103,7 +2103,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
if (TemplateId->isInvalid()) {
// Can't build the declaration.
} else if (TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation &&
- TUK == Sema::TUK_Declaration) {
+ TUK == TagUseKind::Declaration) {
// This is an explicit instantiation of a class template.
ProhibitCXX11Attributes(attrs, diag::err_attributes_not_allowed,
diag::err_keyword_not_allowed,
@@ -2119,8 +2119,8 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// they have template headers, in which case they're ill-formed
// (FIXME: "template <class T> friend class A<T>::B<int>;").
// We diagnose this error in ActOnClassTemplateSpecialization.
- } else if (TUK == Sema::TUK_Reference ||
- (TUK == Sema::TUK_Friend &&
+ } else if (TUK == TagUseKind::Reference ||
+ (TUK == TagUseKind::Friend &&
TemplateInfo.Kind == ParsedTemplateInfo::NonTemplate)) {
ProhibitCXX11Attributes(attrs, diag::err_attributes_not_allowed,
diag::err_keyword_not_allowed,
@@ -2145,10 +2145,10 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// It this is friend declaration however, since it cannot have a
// template header, it is most likely that the user meant to
// remove the 'template' keyword.
- assert((TUK == Sema::TUK_Definition || TUK == Sema::TUK_Friend) &&
+ assert((TUK == TagUseKind::Definition || TUK == TagUseKind::Friend) &&
"Expected a definition here");
- if (TUK == Sema::TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
Diag(DS.getFriendSpecLoc(), diag::err_friend_explicit_instantiation);
TemplateParams = nullptr;
} else {
@@ -2179,7 +2179,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
&SkipBody);
}
} else if (TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation &&
- TUK == Sema::TUK_Declaration) {
+ TUK == TagUseKind::Declaration) {
// Explicit instantiation of a member of a class template
// specialization, e.g.,
//
@@ -2190,7 +2190,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
TagOrTempResult = Actions.ActOnExplicitInstantiation(
getCurScope(), TemplateInfo.ExternLoc, TemplateInfo.TemplateLoc,
TagType, StartLoc, SS, Name, NameLoc, attrs);
- } else if (TUK == Sema::TUK_Friend &&
+ } else if (TUK == TagUseKind::Friend &&
TemplateInfo.Kind != ParsedTemplateInfo::NonTemplate) {
ProhibitCXX11Attributes(attrs, diag::err_attributes_not_allowed,
diag::err_keyword_not_allowed,
@@ -2202,12 +2202,12 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
MultiTemplateParamsArg(TemplateParams ? &(*TemplateParams)[0] : nullptr,
TemplateParams ? TemplateParams->size() : 0));
} else {
- if (TUK != Sema::TUK_Declaration && TUK != Sema::TUK_Definition)
+ if (TUK != TagUseKind::Declaration && TUK != TagUseKind::Definition)
ProhibitCXX11Attributes(attrs, diag::err_attributes_not_allowed,
diag::err_keyword_not_allowed,
/* DiagnoseEmptyAttrs=*/true);
- if (TUK == Sema::TUK_Definition &&
+ if (TUK == TagUseKind::Definition &&
TemplateInfo.Kind == ParsedTemplateInfo::ExplicitInstantiation) {
// If the declarator-id is not a template-id, issue a diagnostic and
// recover by ignoring the 'template' keyword.
@@ -2222,7 +2222,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// reference. For example, we don't need the template parameters here:
// template <class T> class A *makeA(T t);
MultiTemplateParamsArg TParams;
- if (TUK != Sema::TUK_Reference && TemplateParams)
+ if (TUK != TagUseKind::Reference && TemplateParams)
TParams =
MultiTemplateParamsArg(&(*TemplateParams)[0], TemplateParams->size());
@@ -2241,7 +2241,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// If ActOnTag said the type was dependent, try again with the
// less common call.
if (IsDependent) {
- assert(TUK == Sema::TUK_Reference || TUK == Sema::TUK_Friend);
+ assert(TUK == TagUseKind::Reference || TUK == TagUseKind::Friend);
TypeResult = Actions.ActOnDependentTag(getCurScope(), TagType, TUK, SS,
Name, StartLoc, NameLoc);
}
@@ -2252,13 +2252,13 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
// just merge them into the current pool.
if (shouldDelayDiagsInTag) {
diagsFromTag.done();
- if (TUK == Sema::TUK_Reference &&
+ if (TUK == TagUseKind::Reference &&
TemplateInfo.Kind == ParsedTemplateInfo::Template)
diagsFromTag.redelay();
}
// If there is a body, parse it and inform the actions module.
- if (TUK == Sema::TUK_Definition) {
+ if (TUK == TagUseKind::Definition) {
assert(Tok.is(tok::l_brace) ||
(getLangOpts().CPlusPlus && Tok.is(tok::colon)) ||
isClassCompatibleKeyword());
@@ -2316,7 +2316,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind,
//
// After a type-specifier, we don't expect a semicolon. This only happens in
// C, since definitions are not permitted in this context in C++.
- if (TUK == Sema::TUK_Definition &&
+ if (TUK == TagUseKind::Definition &&
(getLangOpts().CPlusPlus || !isTypeSpecifier(DSC)) &&
(TemplateInfo.Kind || !isValidAfterTypeSpecifier(false))) {
if (Tok.isNot(tok::semi)) {
@@ -4560,7 +4560,8 @@ static bool IsBuiltInOrStandardCXX11Attribute(IdentifierInfo *AttrName,
bool Parser::ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs,
IdentifierInfo *AttrName,
SourceLocation AttrNameLoc,
- SourceLocation *EndLoc) {
+ SourceLocation *EndLoc,
+ ParsedAttr::Form Form) {
assert(Tok.is(tok::l_paren) && "Not a C++11 attribute argument list");
BalancedDelimiterTracker T(*this, tok::l_paren);
T.consumeOpen();
@@ -4603,7 +4604,7 @@ bool Parser::ParseCXXAssumeAttributeArg(ParsedAttributes &Attrs,
auto RParen = Tok.getLocation();
T.consumeClose();
Attrs.addNew(AttrName, SourceRange(AttrNameLoc, RParen), nullptr,
- SourceLocation(), &Assumption, 1, ParsedAttr::Form::CXX11());
+ SourceLocation(), &Assumption, 1, Form);
if (EndLoc)
*EndLoc = RParen;
@@ -4683,7 +4684,7 @@ bool Parser::ParseCXX11AttributeArgs(
ScopeName, ScopeLoc, Form);
// So does C++23's assume() attribute.
else if (!ScopeName && AttrName->isStr("assume")) {
- if (ParseCXXAssumeAttributeArg(Attrs, AttrName, AttrNameLoc, EndLoc))
+ if (ParseCXXAssumeAttributeArg(Attrs, AttrName, AttrNameLoc, EndLoc, Form))
return true;
NumArgs = 1;
} else
diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp
index 89f4acbd25e4..6a2088a73c55 100644
--- a/clang/lib/Parse/ParseObjc.cpp
+++ b/clang/lib/Parse/ParseObjc.cpp
@@ -780,16 +780,16 @@ void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey,
}
bool addedToDeclSpec = false;
- auto ObjCPropertyCallback = [&](ParsingFieldDeclarator &FD) {
+ auto ObjCPropertyCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
if (FD.D.getIdentifier() == nullptr) {
Diag(AtLoc, diag::err_objc_property_requires_field_name)
<< FD.D.getSourceRange();
- return;
+ return nullptr;
}
if (FD.BitfieldSize) {
Diag(AtLoc, diag::err_objc_property_bitfield)
<< FD.D.getSourceRange();
- return;
+ return nullptr;
}
// Map a nullability property attribute to a context-sensitive keyword
@@ -818,6 +818,7 @@ void Parser::ParseObjCInterfaceDeclList(tok::ObjCKeywordKind contextKey,
MethodImplKind);
FD.complete(Property);
+ return Property;
};
// Parse all the comma separated declarators.
@@ -2013,7 +2014,7 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl,
continue;
}
- auto ObjCIvarCallback = [&](ParsingFieldDeclarator &FD) {
+ auto ObjCIvarCallback = [&](ParsingFieldDeclarator &FD) -> Decl * {
assert(getObjCDeclContext() == interfaceDecl &&
"Ivar should have interfaceDecl as its decl context");
// Install the declarator into the interface decl.
@@ -2024,6 +2025,7 @@ void Parser::ParseObjCClassInstanceVariables(ObjCContainerDecl *interfaceDecl,
if (Field)
AllIvarDecls.push_back(Field);
FD.complete(Field);
+ return Field;
};
// Parse all the comma separated declarators.
diff --git a/clang/lib/Parse/ParsePragma.cpp b/clang/lib/Parse/ParsePragma.cpp
index 643fdac287d1..cc6f18b5b319 100644
--- a/clang/lib/Parse/ParsePragma.cpp
+++ b/clang/lib/Parse/ParsePragma.cpp
@@ -23,6 +23,7 @@
#include "clang/Sema/Scope.h"
#include "clang/Sema/SemaCUDA.h"
#include "clang/Sema/SemaCodeCompletion.h"
+#include "clang/Sema/SemaRISCV.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSwitch.h"
#include <optional>
@@ -4154,7 +4155,7 @@ void PragmaRISCVHandler::HandlePragma(Preprocessor &PP,
}
if (II->isStr("vector"))
- Actions.DeclareRISCVVBuiltins = true;
+ Actions.RISCV().DeclareRVVBuiltins = true;
else if (II->isStr("sifive_vector"))
- Actions.DeclareRISCVSiFiveVectorBuiltins = true;
+ Actions.RISCV().DeclareSiFiveVectorBuiltins = true;
}
diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt
index 6b7742cae2db..fe6471c81ff0 100644
--- a/clang/lib/Sema/CMakeLists.txt
+++ b/clang/lib/Sema/CMakeLists.txt
@@ -71,6 +71,7 @@ add_clang_library(clangSema
SemaTemplateInstantiateDecl.cpp
SemaTemplateVariadic.cpp
SemaType.cpp
+ SemaX86.cpp
TypeLocBuilder.cpp
DEPENDS
diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp
index bb283c54b3d2..a2b29a7bdf50 100644
--- a/clang/lib/Sema/HLSLExternalSemaSource.cpp
+++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp
@@ -308,17 +308,18 @@ struct BuiltinTypeDeclBuilder {
return *this;
}
- TemplateParameterListBuilder addTemplateArgumentList();
- BuiltinTypeDeclBuilder &addSimpleTemplateParams(ArrayRef<StringRef> Names);
+ TemplateParameterListBuilder addTemplateArgumentList(Sema &S);
+ BuiltinTypeDeclBuilder &addSimpleTemplateParams(Sema &S,
+ ArrayRef<StringRef> Names);
};
struct TemplateParameterListBuilder {
BuiltinTypeDeclBuilder &Builder;
- ASTContext &AST;
+ Sema &S;
llvm::SmallVector<NamedDecl *> Params;
- TemplateParameterListBuilder(BuiltinTypeDeclBuilder &RB)
- : Builder(RB), AST(RB.Record->getASTContext()) {}
+ TemplateParameterListBuilder(Sema &S, BuiltinTypeDeclBuilder &RB)
+ : Builder(RB), S(S) {}
~TemplateParameterListBuilder() { finalizeTemplateArgs(); }
@@ -328,12 +329,15 @@ struct TemplateParameterListBuilder {
return *this;
unsigned Position = static_cast<unsigned>(Params.size());
auto *Decl = TemplateTypeParmDecl::Create(
- AST, Builder.Record->getDeclContext(), SourceLocation(),
+ S.Context, Builder.Record->getDeclContext(), SourceLocation(),
SourceLocation(), /* TemplateDepth */ 0, Position,
- &AST.Idents.get(Name, tok::TokenKind::identifier), /* Typename */ false,
+ &S.Context.Idents.get(Name, tok::TokenKind::identifier),
+ /* Typename */ false,
/* ParameterPack */ false);
if (!DefaultValue.isNull())
- Decl->setDefaultArgument(AST.getTrivialTypeSourceInfo(DefaultValue));
+ Decl->setDefaultArgument(
+ S.Context, S.getTrivialTemplateArgumentLoc(DefaultValue, QualType(),
+ SourceLocation()));
Params.emplace_back(Decl);
return *this;
@@ -342,11 +346,11 @@ struct TemplateParameterListBuilder {
BuiltinTypeDeclBuilder &finalizeTemplateArgs() {
if (Params.empty())
return Builder;
- auto *ParamList =
- TemplateParameterList::Create(AST, SourceLocation(), SourceLocation(),
- Params, SourceLocation(), nullptr);
+ auto *ParamList = TemplateParameterList::Create(S.Context, SourceLocation(),
+ SourceLocation(), Params,
+ SourceLocation(), nullptr);
Builder.Template = ClassTemplateDecl::Create(
- AST, Builder.Record->getDeclContext(), SourceLocation(),
+ S.Context, Builder.Record->getDeclContext(), SourceLocation(),
DeclarationName(Builder.Record->getIdentifier()), ParamList,
Builder.Record);
Builder.Record->setDescribedClassTemplate(Builder.Template);
@@ -359,20 +363,22 @@ struct TemplateParameterListBuilder {
Params.clear();
QualType T = Builder.Template->getInjectedClassNameSpecialization();
- T = AST.getInjectedClassNameType(Builder.Record, T);
+ T = S.Context.getInjectedClassNameType(Builder.Record, T);
return Builder;
}
};
} // namespace
-TemplateParameterListBuilder BuiltinTypeDeclBuilder::addTemplateArgumentList() {
- return TemplateParameterListBuilder(*this);
+TemplateParameterListBuilder
+BuiltinTypeDeclBuilder::addTemplateArgumentList(Sema &S) {
+ return TemplateParameterListBuilder(S, *this);
}
BuiltinTypeDeclBuilder &
-BuiltinTypeDeclBuilder::addSimpleTemplateParams(ArrayRef<StringRef> Names) {
- TemplateParameterListBuilder Builder = this->addTemplateArgumentList();
+BuiltinTypeDeclBuilder::addSimpleTemplateParams(Sema &S,
+ ArrayRef<StringRef> Names) {
+ TemplateParameterListBuilder Builder = this->addTemplateArgumentList(S);
for (StringRef Name : Names)
Builder.addTypeParameter(Name);
return Builder.finalizeTemplateArgs();
@@ -426,7 +432,9 @@ void HLSLExternalSemaSource::defineHLSLVectorAlias() {
auto *TypeParam = TemplateTypeParmDecl::Create(
AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 0,
&AST.Idents.get("element", tok::TokenKind::identifier), false, false);
- TypeParam->setDefaultArgument(AST.getTrivialTypeSourceInfo(AST.FloatTy));
+ TypeParam->setDefaultArgument(
+ AST, SemaPtr->getTrivialTemplateArgumentLoc(
+ TemplateArgument(AST.FloatTy), QualType(), SourceLocation()));
TemplateParams.emplace_back(TypeParam);
@@ -434,10 +442,12 @@ void HLSLExternalSemaSource::defineHLSLVectorAlias() {
AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 1,
&AST.Idents.get("element_count", tok::TokenKind::identifier), AST.IntTy,
false, AST.getTrivialTypeSourceInfo(AST.IntTy));
- Expr *LiteralExpr =
- IntegerLiteral::Create(AST, llvm::APInt(AST.getIntWidth(AST.IntTy), 4),
- AST.IntTy, SourceLocation());
- SizeParam->setDefaultArgument(LiteralExpr);
+ llvm::APInt Val(AST.getIntWidth(AST.IntTy), 4);
+ TemplateArgument Default(AST, llvm::APSInt(std::move(Val)), AST.IntTy,
+ /*IsDefaulted=*/true);
+ SizeParam->setDefaultArgument(
+ AST, SemaPtr->getTrivialTemplateArgumentLoc(Default, AST.IntTy,
+ SourceLocation(), SizeParam));
TemplateParams.emplace_back(SizeParam);
auto *ParamList =
@@ -492,7 +502,7 @@ static BuiltinTypeDeclBuilder setupBufferType(CXXRecordDecl *Decl, Sema &S,
void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
CXXRecordDecl *Decl;
Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "RWBuffer")
- .addSimpleTemplateParams({"element_type"})
+ .addSimpleTemplateParams(*SemaPtr, {"element_type"})
.Record;
onCompletion(Decl, [this](CXXRecordDecl *Decl) {
setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
@@ -503,7 +513,7 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() {
Decl =
BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "RasterizerOrderedBuffer")
- .addSimpleTemplateParams({"element_type"})
+ .addSimpleTemplateParams(*SemaPtr, {"element_type"})
.Record;
onCompletion(Decl, [this](CXXRecordDecl *Decl) {
setupBufferType(Decl, *SemaPtr, ResourceClass::UAV,
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 2c5774da3f66..d1fb21bb1ae1 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -50,7 +50,9 @@
#include "clang/Sema/SemaOpenACC.h"
#include "clang/Sema/SemaOpenMP.h"
#include "clang/Sema/SemaPseudoObject.h"
+#include "clang/Sema/SemaRISCV.h"
#include "clang/Sema/SemaSYCL.h"
+#include "clang/Sema/SemaX86.h"
#include "clang/Sema/TemplateDeduction.h"
#include "clang/Sema/TemplateInstCallback.h"
#include "clang/Sema/TypoCorrection.h"
@@ -212,7 +214,9 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
OpenACCPtr(std::make_unique<SemaOpenACC>(*this)),
OpenMPPtr(std::make_unique<SemaOpenMP>(*this)),
PseudoObjectPtr(std::make_unique<SemaPseudoObject>(*this)),
+ RISCVPtr(std::make_unique<SemaRISCV>(*this)),
SYCLPtr(std::make_unique<SemaSYCL>(*this)),
+ X86Ptr(std::make_unique<SemaX86>(*this)),
MSPointerToMemberRepresentationMethod(
LangOpts.getMSPointerToMemberRepresentationMethod()),
MSStructPragmaOn(false), VtorDispStack(LangOpts.getVtorDispMode()),
@@ -2051,7 +2055,7 @@ void Sema::checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D) {
if (TI.hasRISCVVTypes() && Ty->isRVVSizelessBuiltinType() && FD) {
llvm::StringMap<bool> CallerFeatureMap;
Context.getFunctionFeatureMap(CallerFeatureMap, FD);
- checkRVVTypeSupport(Ty, Loc, D, CallerFeatureMap);
+ RISCV().checkRVVTypeSupport(Ty, Loc, D, CallerFeatureMap);
}
// Don't allow SVE types in functions without a SVE target.
diff --git a/clang/lib/Sema/SemaAvailability.cpp b/clang/lib/Sema/SemaAvailability.cpp
index 663b6f35b869..22f5a2f66347 100644
--- a/clang/lib/Sema/SemaAvailability.cpp
+++ b/clang/lib/Sema/SemaAvailability.cpp
@@ -987,11 +987,6 @@ void Sema::DiagnoseUnguardedAvailabilityViolations(Decl *D) {
Stmt *Body = nullptr;
if (auto *FD = D->getAsFunction()) {
- // FIXME: We only examine the pattern decl for availability violations now,
- // but we should also examine instantiated templates.
- if (FD->isTemplateInstantiation())
- return;
-
Body = FD->getBody();
if (auto *CD = dyn_cast<CXXConstructorDecl>(FD))
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index 483ec7e36eae..7db6b1dfe923 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -25,6 +25,7 @@
#include "clang/Sema/Initialization.h"
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/SemaObjC.h"
+#include "clang/Sema/SemaRISCV.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include <set>
@@ -2391,7 +2392,7 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr,
}
// Allow bitcasting between SVE VLATs and VLSTs, and vice-versa.
- if (Self.isValidRVVBitcast(SrcType, DestType)) {
+ if (Self.RISCV().isValidRVVBitcast(SrcType, DestType)) {
Kind = CK_BitCast;
return TC_Success;
}
@@ -3002,7 +3003,7 @@ void CastOperation::CheckCStyleCast() {
// Allow bitcasting between compatible RVV vector types.
if ((SrcType->isVectorType() || DestType->isVectorType()) &&
- Self.isValidRVVBitcast(SrcType, DestType)) {
+ Self.RISCV().isValidRVVBitcast(SrcType, DestType)) {
Kind = CK_BitCast;
return;
}
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index f2dc8e9dd005..fac9a58fa268 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -63,6 +63,8 @@
#include "clang/Sema/Sema.h"
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/SemaObjC.h"
+#include "clang/Sema/SemaRISCV.h"
+#include "clang/Sema/SemaX86.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
@@ -120,13 +122,12 @@ static constexpr unsigned short combineFAPK(Sema::FormatArgumentPassingKind A,
/// Checks that a call expression's argument count is at least the desired
/// number. This is useful when doing custom type-checking on a variadic
/// function. Returns true on error.
-static bool checkArgCountAtLeast(Sema &S, CallExpr *Call,
- unsigned MinArgCount) {
+bool Sema::checkArgCountAtLeast(CallExpr *Call, unsigned MinArgCount) {
unsigned ArgCount = Call->getNumArgs();
if (ArgCount >= MinArgCount)
return false;
- return S.Diag(Call->getEndLoc(), diag::err_typecheck_call_too_few_args)
+ return Diag(Call->getEndLoc(), diag::err_typecheck_call_too_few_args)
<< 0 /*function call*/ << MinArgCount << ArgCount
<< /*is non object*/ 0 << Call->getSourceRange();
}
@@ -134,12 +135,11 @@ static bool checkArgCountAtLeast(Sema &S, CallExpr *Call,
/// Checks that a call expression's argument count is at most the desired
/// number. This is useful when doing custom type-checking on a variadic
/// function. Returns true on error.
-static bool checkArgCountAtMost(Sema &S, CallExpr *Call, unsigned MaxArgCount) {
+bool Sema::checkArgCountAtMost(CallExpr *Call, unsigned MaxArgCount) {
unsigned ArgCount = Call->getNumArgs();
if (ArgCount <= MaxArgCount)
return false;
- return S.Diag(Call->getEndLoc(),
- diag::err_typecheck_call_too_many_args_at_most)
+ return Diag(Call->getEndLoc(), diag::err_typecheck_call_too_many_args_at_most)
<< 0 /*function call*/ << MaxArgCount << ArgCount
<< /*is non object*/ 0 << Call->getSourceRange();
}
@@ -147,20 +147,20 @@ static bool checkArgCountAtMost(Sema &S, CallExpr *Call, unsigned MaxArgCount) {
/// Checks that a call expression's argument count is in the desired range. This
/// is useful when doing custom type-checking on a variadic function. Returns
/// true on error.
-static bool checkArgCountRange(Sema &S, CallExpr *Call, unsigned MinArgCount,
- unsigned MaxArgCount) {
- return checkArgCountAtLeast(S, Call, MinArgCount) ||
- checkArgCountAtMost(S, Call, MaxArgCount);
+bool Sema::checkArgCountRange(CallExpr *Call, unsigned MinArgCount,
+ unsigned MaxArgCount) {
+ return checkArgCountAtLeast(Call, MinArgCount) ||
+ checkArgCountAtMost(Call, MaxArgCount);
}
/// Checks that a call expression's argument count is the desired number.
/// This is useful when doing custom type-checking. Returns true on error.
-static bool checkArgCount(Sema &S, CallExpr *Call, unsigned DesiredArgCount) {
+bool Sema::checkArgCount(CallExpr *Call, unsigned DesiredArgCount) {
unsigned ArgCount = Call->getNumArgs();
if (ArgCount == DesiredArgCount)
return false;
- if (checkArgCountAtLeast(S, Call, DesiredArgCount))
+ if (checkArgCountAtLeast(Call, DesiredArgCount))
return true;
assert(ArgCount > DesiredArgCount && "should have diagnosed this");
@@ -168,7 +168,7 @@ static bool checkArgCount(Sema &S, CallExpr *Call, unsigned DesiredArgCount) {
SourceRange Range(Call->getArg(DesiredArgCount)->getBeginLoc(),
Call->getArg(ArgCount - 1)->getEndLoc());
- return S.Diag(Range.getBegin(), diag::err_typecheck_call_too_many_args)
+ return Diag(Range.getBegin(), diag::err_typecheck_call_too_many_args)
<< 0 /*function call*/ << DesiredArgCount << ArgCount
<< /*is non object*/ 0 << Call->getArg(1)->getSourceRange();
}
@@ -190,7 +190,7 @@ static bool convertArgumentToType(Sema &S, Expr *&Value, QualType Ty) {
/// Check that the first argument to __builtin_annotation is an integer
/// and the second argument is a non-wide string literal.
static bool BuiltinAnnotation(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 2))
+ if (S.checkArgCount(TheCall, 2))
return true;
// First argument should be an integer.
@@ -240,7 +240,7 @@ static bool BuiltinMSVCAnnotation(Sema &S, CallExpr *TheCall) {
/// Check that the argument to __builtin_addressof is a glvalue, and set the
/// result type to the corresponding pointer type.
static bool BuiltinAddressof(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
ExprResult Arg(TheCall->getArg(0));
@@ -255,7 +255,7 @@ static bool BuiltinAddressof(Sema &S, CallExpr *TheCall) {
/// Check that the argument to __builtin_function_start is a function.
static bool BuiltinFunctionStart(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
ExprResult Arg = S.DefaultFunctionArrayLvalueConversion(TheCall->getArg(0));
@@ -279,7 +279,7 @@ static bool BuiltinFunctionStart(Sema &S, CallExpr *TheCall) {
/// Check the number of arguments and set the result type to
/// the argument type.
static bool BuiltinPreserveAI(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
TheCall->setType(TheCall->getArg(0)->getType());
@@ -290,7 +290,7 @@ static bool BuiltinPreserveAI(Sema &S, CallExpr *TheCall) {
/// __builtin_aligned_{up,down}(value, alignment) is an integer or a pointer
/// type (but not a function pointer) and that the alignment is a power-of-two.
static bool BuiltinAlignment(Sema &S, CallExpr *TheCall, unsigned ID) {
- if (checkArgCount(S, TheCall, 2))
+ if (S.checkArgCount(TheCall, 2))
return true;
clang::Expr *Source = TheCall->getArg(0);
@@ -368,7 +368,7 @@ static bool BuiltinAlignment(Sema &S, CallExpr *TheCall, unsigned ID) {
}
static bool BuiltinOverflow(Sema &S, CallExpr *TheCall, unsigned BuiltinID) {
- if (checkArgCount(S, TheCall, 3))
+ if (S.checkArgCount(TheCall, 3))
return true;
std::pair<unsigned, const char *> Builtins[] = {
@@ -696,7 +696,7 @@ struct BuiltinDumpStructGenerator {
} // namespace
static ExprResult BuiltinDumpStruct(Sema &S, CallExpr *TheCall) {
- if (checkArgCountAtLeast(S, TheCall, 2))
+ if (S.checkArgCountAtLeast(TheCall, 2))
return ExprError();
ExprResult PtrArgResult = S.DefaultLvalueConversion(TheCall->getArg(0));
@@ -762,7 +762,7 @@ static ExprResult BuiltinDumpStruct(Sema &S, CallExpr *TheCall) {
}
static bool BuiltinCallWithStaticChain(Sema &S, CallExpr *BuiltinCall) {
- if (checkArgCount(S, BuiltinCall, 2))
+ if (S.checkArgCount(BuiltinCall, 2))
return true;
SourceLocation BuiltinLoc = BuiltinCall->getBeginLoc();
@@ -1504,7 +1504,7 @@ static bool checkOpenCLSubgroupExt(Sema &S, CallExpr *Call) {
}
static bool OpenCLBuiltinNDRangeAndBlock(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 2))
+ if (S.checkArgCount(TheCall, 2))
return true;
if (checkOpenCLSubgroupExt(S, TheCall))
@@ -1531,7 +1531,7 @@ static bool OpenCLBuiltinNDRangeAndBlock(Sema &S, CallExpr *TheCall) {
/// get_kernel_work_group_size
/// and get_kernel_preferred_work_group_size_multiple builtin functions.
static bool OpenCLBuiltinKernelWorkGroupSize(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
Expr *BlockArg = TheCall->getArg(0);
@@ -1861,7 +1861,7 @@ static bool BuiltinRWPipe(Sema &S, CallExpr *Call) {
// \param Call The call to the builtin function to be analyzed.
// \return True if a semantic error was found, false otherwise.
static bool BuiltinReserveRWPipe(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return true;
if (checkOpenCLPipeArg(S, Call))
@@ -1890,7 +1890,7 @@ static bool BuiltinReserveRWPipe(Sema &S, CallExpr *Call) {
// \param Call The call to the builtin function to be analyzed.
// \return True if a semantic error was found, false otherwise.
static bool BuiltinCommitRWPipe(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return true;
if (checkOpenCLPipeArg(S, Call))
@@ -1913,7 +1913,7 @@ static bool BuiltinCommitRWPipe(Sema &S, CallExpr *Call) {
// \param Call The call to the builtin function to be analyzed.
// \return True if a semantic error was found, false otherwise.
static bool BuiltinPipePackets(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 1))
+ if (S.checkArgCount(Call, 1))
return true;
if (!Call->getArg(0)->getType()->isPipeType()) {
@@ -1932,7 +1932,7 @@ static bool BuiltinPipePackets(Sema &S, CallExpr *Call) {
// \param Call A pointer to the builtin call.
// \return True if a semantic error has been found, false otherwise.
static bool OpenCLBuiltinToAddr(Sema &S, unsigned BuiltinID, CallExpr *Call) {
- if (checkArgCount(S, Call, 1))
+ if (S.checkArgCount(Call, 1))
return true;
auto RT = Call->getArg(0)->getType();
@@ -2087,7 +2087,7 @@ static bool checkPointerAuthValue(Sema &S, Expr *&Arg,
}
static ExprResult PointerAuthStrip(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2100,7 +2100,7 @@ static ExprResult PointerAuthStrip(Sema &S, CallExpr *Call) {
}
static ExprResult PointerAuthBlendDiscriminator(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2113,7 +2113,7 @@ static ExprResult PointerAuthBlendDiscriminator(Sema &S, CallExpr *Call) {
}
static ExprResult PointerAuthSignGenericData(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 2))
+ if (S.checkArgCount(Call, 2))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2127,7 +2127,7 @@ static ExprResult PointerAuthSignGenericData(Sema &S, CallExpr *Call) {
static ExprResult PointerAuthSignOrAuth(Sema &S, CallExpr *Call,
PointerAuthOpKind OpKind) {
- if (checkArgCount(S, Call, 3))
+ if (S.checkArgCount(Call, 3))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2141,7 +2141,7 @@ static ExprResult PointerAuthSignOrAuth(Sema &S, CallExpr *Call,
}
static ExprResult PointerAuthAuthAndResign(Sema &S, CallExpr *Call) {
- if (checkArgCount(S, Call, 5))
+ if (S.checkArgCount(Call, 5))
return ExprError();
if (checkPointerAuthEnabled(S, Call))
return ExprError();
@@ -2157,7 +2157,7 @@ static ExprResult PointerAuthAuthAndResign(Sema &S, CallExpr *Call) {
}
static ExprResult BuiltinLaunder(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return ExprError();
// Compute __builtin_launder's parameter type from the argument.
@@ -2278,7 +2278,7 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
return CheckSystemZBuiltinFunctionCall(BuiltinID, TheCall);
case llvm::Triple::x86:
case llvm::Triple::x86_64:
- return CheckX86BuiltinFunctionCall(TI, BuiltinID, TheCall);
+ return X86().CheckBuiltinFunctionCall(TI, BuiltinID, TheCall);
case llvm::Triple::ppc:
case llvm::Triple::ppcle:
case llvm::Triple::ppc64:
@@ -2288,7 +2288,7 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
return CheckAMDGCNBuiltinFunctionCall(BuiltinID, TheCall);
case llvm::Triple::riscv32:
case llvm::Triple::riscv64:
- return CheckRISCVBuiltinFunctionCall(TI, BuiltinID, TheCall);
+ return RISCV().CheckBuiltinFunctionCall(TI, BuiltinID, TheCall);
case llvm::Triple::loongarch32:
case llvm::Triple::loongarch64:
return CheckLoongArchBuiltinFunctionCall(TI, BuiltinID, TheCall);
@@ -2377,7 +2377,7 @@ static bool BuiltinCpu(Sema &S, const TargetInfo &TI, CallExpr *TheCall,
/// Checks that __builtin_popcountg was called with a single argument, which is
/// an unsigned integer.
static bool BuiltinPopcountg(Sema &S, CallExpr *TheCall) {
- if (checkArgCount(S, TheCall, 1))
+ if (S.checkArgCount(TheCall, 1))
return true;
ExprResult ArgRes = S.DefaultLvalueConversion(TheCall->getArg(0));
@@ -2401,7 +2401,7 @@ static bool BuiltinPopcountg(Sema &S, CallExpr *TheCall) {
/// an unsigned integer, and an optional second argument, which is promoted to
/// an 'int'.
static bool BuiltinCountZeroBitsGeneric(Sema &S, CallExpr *TheCall) {
- if (checkArgCountRange(S, TheCall, 1, 2))
+ if (S.checkArgCountRange(TheCall, 1, 2))
return true;
ExprResult Arg0Res = S.DefaultLvalueConversion(TheCall->getArg(0));
@@ -2625,7 +2625,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
return ExprError();
break;
case Builtin::BI__builtin_classify_type:
- if (checkArgCount(*this, TheCall, 1)) return true;
+ if (checkArgCount(TheCall, 1))
+ return true;
TheCall->setType(Context.IntTy);
break;
case Builtin::BI__builtin_complex:
@@ -2633,7 +2634,8 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
return ExprError();
break;
case Builtin::BI__builtin_constant_p: {
- if (checkArgCount(*this, TheCall, 1)) return true;
+ if (checkArgCount(TheCall, 1))
+ return true;
ExprResult Arg = DefaultFunctionArrayLvalueConversion(TheCall->getArg(0));
if (Arg.isInvalid()) return true;
TheCall->setArg(0, Arg.get());
@@ -2822,7 +2824,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
return BuiltinDumpStruct(*this, TheCall);
case Builtin::BI__builtin_expect_with_probability: {
// We first want to ensure we are called with 3 arguments
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return ExprError();
// then check probability is constant float in range [0.0, 1.0]
const Expr *ProbArg = TheCall->getArg(2);
@@ -2870,7 +2872,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
return ExprError();
break;
case Builtin::BI__GetExceptionInfo:
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return ExprError();
if (CheckCXXThrowOperand(
@@ -2891,7 +2893,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
// These are all expected to be of the form
// T &/&&/* f(U &/&&)
// where T and U only differ in qualification.
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return ExprError();
QualType Param = FDecl->getParamDecl(0)->getType();
QualType Result = FDecl->getReturnType();
@@ -3129,7 +3131,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
}
case Builtin::BI__builtin_elementwise_copysign: {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return ExprError();
ExprResult Magnitude = UsualUnaryConversions(TheCall->getArg(0));
@@ -3806,7 +3808,7 @@ bool Sema::CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CallExpr *TheCall,
DeclRefExpr *DRE =cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
// Ensure that we have the proper number of arguments.
- if (checkArgCount(*this, TheCall, IsLdrex ? 1 : 2))
+ if (checkArgCount(TheCall, IsLdrex ? 1 : 2))
return true;
// Inspect the pointer argument of the atomic builtin. This should always be
@@ -4145,7 +4147,7 @@ bool Sema::CheckBPFBuiltinFunctionCall(unsigned BuiltinID,
BuiltinID == BPF::BI__builtin_preserve_enum_value) &&
"unexpected BPF builtin");
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
// The second argument needs to be a constant int
@@ -5589,12 +5591,12 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
switch (BuiltinID) {
case Builtin::BI__builtin_hlsl_elementwise_all:
case Builtin::BI__builtin_hlsl_elementwise_any: {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
break;
}
case Builtin::BI__builtin_hlsl_elementwise_clamp: {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
if (CheckVectorElementCallArgs(this, TheCall))
return true;
@@ -5605,7 +5607,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
break;
}
case Builtin::BI__builtin_hlsl_dot: {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
if (CheckVectorElementCallArgs(this, TheCall))
return true;
@@ -5639,7 +5641,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
break;
}
case Builtin::BI__builtin_hlsl_lerp: {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
if (CheckVectorElementCallArgs(this, TheCall))
return true;
@@ -5650,7 +5652,7 @@ bool Sema::CheckHLSLBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
break;
}
case Builtin::BI__builtin_hlsl_mad: {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
if (CheckVectorElementCallArgs(this, TheCall))
return true;
@@ -5694,6 +5696,28 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
// position of memory order and scope arguments in the builtin
unsigned OrderIndex, ScopeIndex;
switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
+ constexpr const int SizeIdx = 2;
+ llvm::APSInt Size;
+ Expr *ArgExpr = TheCall->getArg(SizeIdx);
+ ExprResult R = VerifyIntegerConstantExpression(ArgExpr, &Size);
+ if (R.isInvalid())
+ return true;
+ switch (Size.getSExtValue()) {
+ case 1:
+ case 2:
+ case 4:
+ return false;
+ default:
+ Diag(ArgExpr->getExprLoc(),
+ diag::err_amdgcn_global_load_lds_size_invalid_value)
+ << ArgExpr->getSourceRange();
+ Diag(ArgExpr->getExprLoc(),
+ diag::note_amdgcn_global_load_lds_size_valid_value)
+ << ArgExpr->getSourceRange();
+ return true;
+ }
+ }
case AMDGPU::BI__builtin_amdgcn_get_fpenv:
case AMDGPU::BI__builtin_amdgcn_set_fpenv:
return false;
@@ -5753,866 +5777,6 @@ bool Sema::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
return false;
}
-bool Sema::CheckRISCVLMUL(CallExpr *TheCall, unsigned ArgNum) {
- llvm::APSInt Result;
-
- // We can't check the value of a dependent argument.
- Expr *Arg = TheCall->getArg(ArgNum);
- if (Arg->isTypeDependent() || Arg->isValueDependent())
- return false;
-
- // Check constant-ness first.
- if (BuiltinConstantArg(TheCall, ArgNum, Result))
- return true;
-
- int64_t Val = Result.getSExtValue();
- if ((Val >= 0 && Val <= 3) || (Val >= 5 && Val <= 7))
- return false;
-
- return Diag(TheCall->getBeginLoc(), diag::err_riscv_builtin_invalid_lmul)
- << Arg->getSourceRange();
-}
-
-static bool CheckInvalidVLENandLMUL(const TargetInfo &TI, CallExpr *TheCall,
- Sema &S, QualType Type, int EGW) {
- assert((EGW == 128 || EGW == 256) && "EGW can only be 128 or 256 bits");
-
- // LMUL * VLEN >= EGW
- ASTContext::BuiltinVectorTypeInfo Info =
- S.Context.getBuiltinVectorTypeInfo(Type->castAs<BuiltinType>());
- unsigned ElemSize = S.Context.getTypeSize(Info.ElementType);
- unsigned MinElemCount = Info.EC.getKnownMinValue();
-
- unsigned EGS = EGW / ElemSize;
- // If EGS is less than or equal to the minimum number of elements, then the
- // type is valid.
- if (EGS <= MinElemCount)
- return false;
-
- // Otherwise, we need vscale to be at least EGS / MinElemCount.
- assert(EGS % MinElemCount == 0);
- unsigned VScaleFactor = EGS / MinElemCount;
- // Vscale is VLEN/RVVBitsPerBlock.
- unsigned MinRequiredVLEN = VScaleFactor * llvm::RISCV::RVVBitsPerBlock;
- std::string RequiredExt = "zvl" + std::to_string(MinRequiredVLEN) + "b";
- if (!TI.hasFeature(RequiredExt))
- return S.Diag(TheCall->getBeginLoc(),
- diag::err_riscv_type_requires_extension) << Type << RequiredExt;
-
- return false;
-}
-
-bool Sema::CheckRISCVBuiltinFunctionCall(const TargetInfo &TI,
- unsigned BuiltinID,
- CallExpr *TheCall) {
- // vmulh.vv, vmulh.vx, vmulhu.vv, vmulhu.vx, vmulhsu.vv, vmulhsu.vx,
- // vsmul.vv, vsmul.vx are not included for EEW=64 in Zve64*.
- switch (BuiltinID) {
- default:
- break;
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_m:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_m:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_mu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_mu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tum:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tum:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_tu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_tu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_m:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_m:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_mu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_mu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_tum:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_tum:
- case RISCVVector::BI__builtin_rvv_vmulhu_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vmulhu_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vmulh_vv:
- case RISCVVector::BI__builtin_rvv_vmulh_vx:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_tu:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_tu:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_m:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_m:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_mu:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_mu:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_tum:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_tum:
- case RISCVVector::BI__builtin_rvv_vmulh_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vmulh_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv:
- case RISCVVector::BI__builtin_rvv_vsmul_vx:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_m:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_m:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tum:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tum:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tumu: {
- ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(
- TheCall->getType()->castAs<BuiltinType>());
-
- if (Context.getTypeSize(Info.ElementType) == 64 && !TI.hasFeature("v"))
- return Diag(TheCall->getBeginLoc(),
- diag::err_riscv_builtin_requires_extension)
- << /* IsExtension */ true << TheCall->getSourceRange() << "v";
-
- break;
- }
- }
-
- switch (BuiltinID) {
- case RISCVVector::BI__builtin_rvv_vsetvli:
- return BuiltinConstantArgRange(TheCall, 1, 0, 3) ||
- CheckRISCVLMUL(TheCall, 2);
- case RISCVVector::BI__builtin_rvv_vsetvlimax:
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- CheckRISCVLMUL(TheCall, 1);
- case RISCVVector::BI__builtin_rvv_vget_v: {
- ASTContext::BuiltinVectorTypeInfo ResVecInfo =
- Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
- TheCall->getType().getCanonicalType().getTypePtr()));
- ASTContext::BuiltinVectorTypeInfo VecInfo =
- Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
- TheCall->getArg(0)->getType().getCanonicalType().getTypePtr()));
- unsigned MaxIndex;
- if (VecInfo.NumVectors != 1) // vget for tuple type
- MaxIndex = VecInfo.NumVectors;
- else // vget for non-tuple type
- MaxIndex = (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors) /
- (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors);
- return BuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1);
- }
- case RISCVVector::BI__builtin_rvv_vset_v: {
- ASTContext::BuiltinVectorTypeInfo ResVecInfo =
- Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
- TheCall->getType().getCanonicalType().getTypePtr()));
- ASTContext::BuiltinVectorTypeInfo VecInfo =
- Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
- TheCall->getArg(2)->getType().getCanonicalType().getTypePtr()));
- unsigned MaxIndex;
- if (ResVecInfo.NumVectors != 1) // vset for tuple type
- MaxIndex = ResVecInfo.NumVectors;
- else // vset for non-tuple type
- MaxIndex = (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors) /
- (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors);
- return BuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1);
- }
- // Vector Crypto
- case RISCVVector::BI__builtin_rvv_vaeskf1_vi_tu:
- case RISCVVector::BI__builtin_rvv_vaeskf2_vi_tu:
- case RISCVVector::BI__builtin_rvv_vaeskf2_vi:
- case RISCVVector::BI__builtin_rvv_vsm4k_vi_tu: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- QualType Op2Type = TheCall->getArg(1)->getType();
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, 128) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op2Type, 128) ||
- BuiltinConstantArgRange(TheCall, 2, 0, 31);
- }
- case RISCVVector::BI__builtin_rvv_vsm3c_vi_tu:
- case RISCVVector::BI__builtin_rvv_vsm3c_vi: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, 256) ||
- BuiltinConstantArgRange(TheCall, 2, 0, 31);
- }
- case RISCVVector::BI__builtin_rvv_vaeskf1_vi:
- case RISCVVector::BI__builtin_rvv_vsm4k_vi: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, 128) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31);
- }
- case RISCVVector::BI__builtin_rvv_vaesdf_vv:
- case RISCVVector::BI__builtin_rvv_vaesdf_vs:
- case RISCVVector::BI__builtin_rvv_vaesdm_vv:
- case RISCVVector::BI__builtin_rvv_vaesdm_vs:
- case RISCVVector::BI__builtin_rvv_vaesef_vv:
- case RISCVVector::BI__builtin_rvv_vaesef_vs:
- case RISCVVector::BI__builtin_rvv_vaesem_vv:
- case RISCVVector::BI__builtin_rvv_vaesem_vs:
- case RISCVVector::BI__builtin_rvv_vaesz_vs:
- case RISCVVector::BI__builtin_rvv_vsm4r_vv:
- case RISCVVector::BI__builtin_rvv_vsm4r_vs:
- case RISCVVector::BI__builtin_rvv_vaesdf_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaesdf_vs_tu:
- case RISCVVector::BI__builtin_rvv_vaesdm_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaesdm_vs_tu:
- case RISCVVector::BI__builtin_rvv_vaesef_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaesef_vs_tu:
- case RISCVVector::BI__builtin_rvv_vaesem_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaesem_vs_tu:
- case RISCVVector::BI__builtin_rvv_vaesz_vs_tu:
- case RISCVVector::BI__builtin_rvv_vsm4r_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsm4r_vs_tu: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- QualType Op2Type = TheCall->getArg(1)->getType();
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, 128) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op2Type, 128);
- }
- case RISCVVector::BI__builtin_rvv_vsha2ch_vv:
- case RISCVVector::BI__builtin_rvv_vsha2cl_vv:
- case RISCVVector::BI__builtin_rvv_vsha2ms_vv:
- case RISCVVector::BI__builtin_rvv_vsha2ch_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsha2cl_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsha2ms_vv_tu: {
- QualType Op1Type = TheCall->getArg(0)->getType();
- QualType Op2Type = TheCall->getArg(1)->getType();
- QualType Op3Type = TheCall->getArg(2)->getType();
- ASTContext::BuiltinVectorTypeInfo Info =
- Context.getBuiltinVectorTypeInfo(Op1Type->castAs<BuiltinType>());
- uint64_t ElemSize = Context.getTypeSize(Info.ElementType);
- if (ElemSize == 64 && !TI.hasFeature("zvknhb"))
- return Diag(TheCall->getBeginLoc(),
- diag::err_riscv_builtin_requires_extension)
- << /* IsExtension */ true << TheCall->getSourceRange() << "zvknhb";
-
- return CheckInvalidVLENandLMUL(TI, TheCall, *this, Op1Type, ElemSize * 4) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op2Type, ElemSize * 4) ||
- CheckInvalidVLENandLMUL(TI, TheCall, *this, Op3Type, ElemSize * 4);
- }
-
- case RISCVVector::BI__builtin_rvv_sf_vc_i_se:
- // bit_27_26, bit_24_20, bit_11_7, simm5, sew, log2lmul
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 2, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 3, -16, 15) ||
- CheckRISCVLMUL(TheCall, 5);
- case RISCVVector::BI__builtin_rvv_sf_vc_iv_se:
- // bit_27_26, bit_11_7, vs2, simm5
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 3, -16, 15);
- case RISCVVector::BI__builtin_rvv_sf_vc_v_i:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_i_se:
- // bit_27_26, bit_24_20, simm5
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 2, -16, 15);
- case RISCVVector::BI__builtin_rvv_sf_vc_v_iv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_iv_se:
- // bit_27_26, vs2, simm5
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 2, -16, 15);
- case RISCVVector::BI__builtin_rvv_sf_vc_ivv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_ivw_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_ivv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_ivw:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_ivv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_ivw_se:
- // bit_27_26, vd, vs2, simm5
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 3, -16, 15);
- case RISCVVector::BI__builtin_rvv_sf_vc_x_se:
- // bit_27_26, bit_24_20, bit_11_7, xs1, sew, log2lmul
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
- BuiltinConstantArgRange(TheCall, 2, 0, 31) ||
- CheckRISCVLMUL(TheCall, 5);
- case RISCVVector::BI__builtin_rvv_sf_vc_xv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_vv_se:
- // bit_27_26, bit_11_7, vs2, xs1/vs1
- case RISCVVector::BI__builtin_rvv_sf_vc_v_x:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_x_se:
- // bit_27_26, bit_24-20, xs1
- return BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31);
- case RISCVVector::BI__builtin_rvv_sf_vc_vvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_xvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_vvw_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_xvw_se:
- // bit_27_26, vd, vs2, xs1
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vv_se:
- // bit_27_26, vs2, xs1/vs1
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xvv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vvv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xvw:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vvw:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_xvw_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_vvw_se:
- // bit_27_26, vd, vs2, xs1/vs1
- return BuiltinConstantArgRange(TheCall, 0, 0, 3);
- case RISCVVector::BI__builtin_rvv_sf_vc_fv_se:
- // bit_26, bit_11_7, vs2, fs1
- return BuiltinConstantArgRange(TheCall, 0, 0, 1) ||
- BuiltinConstantArgRange(TheCall, 1, 0, 31);
- case RISCVVector::BI__builtin_rvv_sf_vc_fvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_fvw_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fvv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fvw:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fvv_se:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fvw_se:
- // bit_26, vd, vs2, fs1
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fv:
- case RISCVVector::BI__builtin_rvv_sf_vc_v_fv_se:
- // bit_26, vs2, fs1
- return BuiltinConstantArgRange(TheCall, 0, 0, 1);
- // Check if byteselect is in [0, 3]
- case RISCV::BI__builtin_riscv_aes32dsi:
- case RISCV::BI__builtin_riscv_aes32dsmi:
- case RISCV::BI__builtin_riscv_aes32esi:
- case RISCV::BI__builtin_riscv_aes32esmi:
- case RISCV::BI__builtin_riscv_sm4ks:
- case RISCV::BI__builtin_riscv_sm4ed:
- return BuiltinConstantArgRange(TheCall, 2, 0, 3);
- // Check if rnum is in [0, 10]
- case RISCV::BI__builtin_riscv_aes64ks1i:
- return BuiltinConstantArgRange(TheCall, 1, 0, 10);
- // Check if value range for vxrm is in [0, 3]
- case RISCVVector::BI__builtin_rvv_vaaddu_vv:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx:
- case RISCVVector::BI__builtin_rvv_vaadd_vv:
- case RISCVVector::BI__builtin_rvv_vaadd_vx:
- case RISCVVector::BI__builtin_rvv_vasubu_vv:
- case RISCVVector::BI__builtin_rvv_vasubu_vx:
- case RISCVVector::BI__builtin_rvv_vasub_vv:
- case RISCVVector::BI__builtin_rvv_vasub_vx:
- case RISCVVector::BI__builtin_rvv_vsmul_vv:
- case RISCVVector::BI__builtin_rvv_vsmul_vx:
- case RISCVVector::BI__builtin_rvv_vssra_vv:
- case RISCVVector::BI__builtin_rvv_vssra_vx:
- case RISCVVector::BI__builtin_rvv_vssrl_vv:
- case RISCVVector::BI__builtin_rvv_vssrl_vx:
- case RISCVVector::BI__builtin_rvv_vnclip_wv:
- case RISCVVector::BI__builtin_rvv_vnclip_wx:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx:
- return BuiltinConstantArgRange(TheCall, 2, 0, 3);
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_tu:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_tu:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_tu:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_tu:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_tu:
- case RISCVVector::BI__builtin_rvv_vasub_vv_tu:
- case RISCVVector::BI__builtin_rvv_vasub_vx_tu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tu:
- case RISCVVector::BI__builtin_rvv_vssra_vv_tu:
- case RISCVVector::BI__builtin_rvv_vssra_vx_tu:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_tu:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_tu:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_tu:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_tu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_tu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_tu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_m:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_m:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_m:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_m:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_m:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_m:
- case RISCVVector::BI__builtin_rvv_vasub_vv_m:
- case RISCVVector::BI__builtin_rvv_vasub_vx_m:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_m:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_m:
- case RISCVVector::BI__builtin_rvv_vssra_vv_m:
- case RISCVVector::BI__builtin_rvv_vssra_vx_m:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_m:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_m:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_m:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_m:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_m:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_m:
- return BuiltinConstantArgRange(TheCall, 3, 0, 3);
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_tum:
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vv_mu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_tum:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vaaddu_vx_mu:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_tum:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vaadd_vv_mu:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_tum:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vaadd_vx_mu:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_tum:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vasubu_vv_mu:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_tum:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vasubu_vx_mu:
- case RISCVVector::BI__builtin_rvv_vasub_vv_tum:
- case RISCVVector::BI__builtin_rvv_vasub_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vasub_vv_mu:
- case RISCVVector::BI__builtin_rvv_vasub_vx_tum:
- case RISCVVector::BI__builtin_rvv_vasub_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vasub_vx_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_mu:
- case RISCVVector::BI__builtin_rvv_vssra_vv_mu:
- case RISCVVector::BI__builtin_rvv_vssra_vx_mu:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_mu:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_mu:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_mu:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_mu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_mu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_mu:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tum:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tum:
- case RISCVVector::BI__builtin_rvv_vssra_vv_tum:
- case RISCVVector::BI__builtin_rvv_vssra_vx_tum:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_tum:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_tum:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_tum:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_tum:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_tum:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_tum:
- case RISCVVector::BI__builtin_rvv_vsmul_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vsmul_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vssra_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vssra_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vssrl_vv_tumu:
- case RISCVVector::BI__builtin_rvv_vssrl_vx_tumu:
- case RISCVVector::BI__builtin_rvv_vnclip_wv_tumu:
- case RISCVVector::BI__builtin_rvv_vnclip_wx_tumu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wv_tumu:
- case RISCVVector::BI__builtin_rvv_vnclipu_wx_tumu:
- return BuiltinConstantArgRange(TheCall, 4, 0, 3);
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm:
- return BuiltinConstantArgRange(TheCall, 1, 0, 4);
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm:
- case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm:
- case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm:
- case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_m:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_m:
- return BuiltinConstantArgRange(TheCall, 2, 0, 4);
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tu:
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_m:
- case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_m:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_mu:
- return BuiltinConstantArgRange(TheCall, 3, 0, 4);
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_m:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_m:
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_tum:
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tumu:
- case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_mu:
- case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_mu:
- return BuiltinConstantArgRange(TheCall, 4, 0, 4);
- case RISCV::BI__builtin_riscv_ntl_load:
- case RISCV::BI__builtin_riscv_ntl_store:
- DeclRefExpr *DRE =
- cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
- assert((BuiltinID == RISCV::BI__builtin_riscv_ntl_store ||
- BuiltinID == RISCV::BI__builtin_riscv_ntl_load) &&
- "Unexpected RISC-V nontemporal load/store builtin!");
- bool IsStore = BuiltinID == RISCV::BI__builtin_riscv_ntl_store;
- unsigned NumArgs = IsStore ? 3 : 2;
-
- if (checkArgCountAtLeast(*this, TheCall, NumArgs - 1))
- return true;
-
- if (checkArgCountAtMost(*this, TheCall, NumArgs))
- return true;
-
- // Domain value should be compile-time constant.
- // 2 <= domain <= 5
- if (TheCall->getNumArgs() == NumArgs &&
- BuiltinConstantArgRange(TheCall, NumArgs - 1, 2, 5))
- return true;
-
- Expr *PointerArg = TheCall->getArg(0);
- ExprResult PointerArgResult =
- DefaultFunctionArrayLvalueConversion(PointerArg);
-
- if (PointerArgResult.isInvalid())
- return true;
- PointerArg = PointerArgResult.get();
-
- const PointerType *PtrType = PointerArg->getType()->getAs<PointerType>();
- if (!PtrType) {
- Diag(DRE->getBeginLoc(), diag::err_nontemporal_builtin_must_be_pointer)
- << PointerArg->getType() << PointerArg->getSourceRange();
- return true;
- }
-
- QualType ValType = PtrType->getPointeeType();
- ValType = ValType.getUnqualifiedType();
- if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
- !ValType->isBlockPointerType() && !ValType->isFloatingType() &&
- !ValType->isVectorType() && !ValType->isRVVSizelessBuiltinType()) {
- Diag(DRE->getBeginLoc(),
- diag::err_nontemporal_builtin_must_be_pointer_intfltptr_or_vector)
- << PointerArg->getType() << PointerArg->getSourceRange();
- return true;
- }
-
- if (!IsStore) {
- TheCall->setType(ValType);
- return false;
- }
-
- ExprResult ValArg = TheCall->getArg(1);
- InitializedEntity Entity = InitializedEntity::InitializeParameter(
- Context, ValType, /*consume*/ false);
- ValArg = PerformCopyInitialization(Entity, SourceLocation(), ValArg);
- if (ValArg.isInvalid())
- return true;
-
- TheCall->setArg(1, ValArg.get());
- TheCall->setType(Context.VoidTy);
- return false;
- }
-
- return false;
-}
-
bool Sema::CheckSystemZBuiltinFunctionCall(unsigned BuiltinID,
CallExpr *TheCall) {
if (BuiltinID == SystemZ::BI__builtin_tabort) {
@@ -6708,38 +5872,6 @@ bool Sema::CheckWebAssemblyBuiltinFunctionCall(const TargetInfo &TI,
return false;
}
-void Sema::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
- const llvm::StringMap<bool> &FeatureMap) {
- ASTContext::BuiltinVectorTypeInfo Info =
- Context.getBuiltinVectorTypeInfo(Ty->castAs<BuiltinType>());
- unsigned EltSize = Context.getTypeSize(Info.ElementType);
- unsigned MinElts = Info.EC.getKnownMinValue();
-
- if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Double) &&
- !FeatureMap.lookup("zve64d"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64d";
- // (ELEN, LMUL) pairs of (8, mf8), (16, mf4), (32, mf2), (64, m1) requires at
- // least zve64x
- else if (((EltSize == 64 && Info.ElementType->isIntegerType()) ||
- MinElts == 1) &&
- !FeatureMap.lookup("zve64x"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64x";
- else if (Info.ElementType->isFloat16Type() && !FeatureMap.lookup("zvfh") &&
- !FeatureMap.lookup("zvfhmin"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D)
- << Ty << "zvfh or zvfhmin";
- else if (Info.ElementType->isBFloat16Type() &&
- !FeatureMap.lookup("experimental-zvfbfmin"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfbfmin";
- else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Float) &&
- !FeatureMap.lookup("zve32f"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32f";
- // Given that caller already checked isRVVType() before calling this function,
- // if we don't have at least zve32x supported, then we need to emit error.
- else if (!FeatureMap.lookup("zve32x"))
- Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32x";
-}
-
bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI,
unsigned BuiltinID,
CallExpr *TheCall) {
@@ -6748,862 +5880,12 @@ bool Sema::CheckNVPTXBuiltinFunctionCall(const TargetInfo &TI,
case NVPTX::BI__nvvm_cp_async_ca_shared_global_8:
case NVPTX::BI__nvvm_cp_async_ca_shared_global_16:
case NVPTX::BI__nvvm_cp_async_cg_shared_global_16:
- return checkArgCountAtMost(*this, TheCall, 3);
- }
-
- return false;
-}
-
-// Check if the rounding mode is legal.
-bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
- // Indicates if this instruction has rounding control or just SAE.
- bool HasRC = false;
-
- unsigned ArgNum = 0;
- switch (BuiltinID) {
- default:
- return false;
- case X86::BI__builtin_ia32_vcvttsd2si32:
- case X86::BI__builtin_ia32_vcvttsd2si64:
- case X86::BI__builtin_ia32_vcvttsd2usi32:
- case X86::BI__builtin_ia32_vcvttsd2usi64:
- case X86::BI__builtin_ia32_vcvttss2si32:
- case X86::BI__builtin_ia32_vcvttss2si64:
- case X86::BI__builtin_ia32_vcvttss2usi32:
- case X86::BI__builtin_ia32_vcvttss2usi64:
- case X86::BI__builtin_ia32_vcvttsh2si32:
- case X86::BI__builtin_ia32_vcvttsh2si64:
- case X86::BI__builtin_ia32_vcvttsh2usi32:
- case X86::BI__builtin_ia32_vcvttsh2usi64:
- ArgNum = 1;
- break;
- case X86::BI__builtin_ia32_maxpd512:
- case X86::BI__builtin_ia32_maxps512:
- case X86::BI__builtin_ia32_minpd512:
- case X86::BI__builtin_ia32_minps512:
- case X86::BI__builtin_ia32_maxph512:
- case X86::BI__builtin_ia32_minph512:
- ArgNum = 2;
- break;
- case X86::BI__builtin_ia32_vcvtph2pd512_mask:
- case X86::BI__builtin_ia32_vcvtph2psx512_mask:
- case X86::BI__builtin_ia32_cvtps2pd512_mask:
- case X86::BI__builtin_ia32_cvttpd2dq512_mask:
- case X86::BI__builtin_ia32_cvttpd2qq512_mask:
- case X86::BI__builtin_ia32_cvttpd2udq512_mask:
- case X86::BI__builtin_ia32_cvttpd2uqq512_mask:
- case X86::BI__builtin_ia32_cvttps2dq512_mask:
- case X86::BI__builtin_ia32_cvttps2qq512_mask:
- case X86::BI__builtin_ia32_cvttps2udq512_mask:
- case X86::BI__builtin_ia32_cvttps2uqq512_mask:
- case X86::BI__builtin_ia32_vcvttph2w512_mask:
- case X86::BI__builtin_ia32_vcvttph2uw512_mask:
- case X86::BI__builtin_ia32_vcvttph2dq512_mask:
- case X86::BI__builtin_ia32_vcvttph2udq512_mask:
- case X86::BI__builtin_ia32_vcvttph2qq512_mask:
- case X86::BI__builtin_ia32_vcvttph2uqq512_mask:
- case X86::BI__builtin_ia32_exp2pd_mask:
- case X86::BI__builtin_ia32_exp2ps_mask:
- case X86::BI__builtin_ia32_getexppd512_mask:
- case X86::BI__builtin_ia32_getexpps512_mask:
- case X86::BI__builtin_ia32_getexpph512_mask:
- case X86::BI__builtin_ia32_rcp28pd_mask:
- case X86::BI__builtin_ia32_rcp28ps_mask:
- case X86::BI__builtin_ia32_rsqrt28pd_mask:
- case X86::BI__builtin_ia32_rsqrt28ps_mask:
- case X86::BI__builtin_ia32_vcomisd:
- case X86::BI__builtin_ia32_vcomiss:
- case X86::BI__builtin_ia32_vcomish:
- case X86::BI__builtin_ia32_vcvtph2ps512_mask:
- ArgNum = 3;
- break;
- case X86::BI__builtin_ia32_cmppd512_mask:
- case X86::BI__builtin_ia32_cmpps512_mask:
- case X86::BI__builtin_ia32_cmpsd_mask:
- case X86::BI__builtin_ia32_cmpss_mask:
- case X86::BI__builtin_ia32_cmpsh_mask:
- case X86::BI__builtin_ia32_vcvtsh2sd_round_mask:
- case X86::BI__builtin_ia32_vcvtsh2ss_round_mask:
- case X86::BI__builtin_ia32_cvtss2sd_round_mask:
- case X86::BI__builtin_ia32_getexpsd128_round_mask:
- case X86::BI__builtin_ia32_getexpss128_round_mask:
- case X86::BI__builtin_ia32_getexpsh128_round_mask:
- case X86::BI__builtin_ia32_getmantpd512_mask:
- case X86::BI__builtin_ia32_getmantps512_mask:
- case X86::BI__builtin_ia32_getmantph512_mask:
- case X86::BI__builtin_ia32_maxsd_round_mask:
- case X86::BI__builtin_ia32_maxss_round_mask:
- case X86::BI__builtin_ia32_maxsh_round_mask:
- case X86::BI__builtin_ia32_minsd_round_mask:
- case X86::BI__builtin_ia32_minss_round_mask:
- case X86::BI__builtin_ia32_minsh_round_mask:
- case X86::BI__builtin_ia32_rcp28sd_round_mask:
- case X86::BI__builtin_ia32_rcp28ss_round_mask:
- case X86::BI__builtin_ia32_reducepd512_mask:
- case X86::BI__builtin_ia32_reduceps512_mask:
- case X86::BI__builtin_ia32_reduceph512_mask:
- case X86::BI__builtin_ia32_rndscalepd_mask:
- case X86::BI__builtin_ia32_rndscaleps_mask:
- case X86::BI__builtin_ia32_rndscaleph_mask:
- case X86::BI__builtin_ia32_rsqrt28sd_round_mask:
- case X86::BI__builtin_ia32_rsqrt28ss_round_mask:
- ArgNum = 4;
- break;
- case X86::BI__builtin_ia32_fixupimmpd512_mask:
- case X86::BI__builtin_ia32_fixupimmpd512_maskz:
- case X86::BI__builtin_ia32_fixupimmps512_mask:
- case X86::BI__builtin_ia32_fixupimmps512_maskz:
- case X86::BI__builtin_ia32_fixupimmsd_mask:
- case X86::BI__builtin_ia32_fixupimmsd_maskz:
- case X86::BI__builtin_ia32_fixupimmss_mask:
- case X86::BI__builtin_ia32_fixupimmss_maskz:
- case X86::BI__builtin_ia32_getmantsd_round_mask:
- case X86::BI__builtin_ia32_getmantss_round_mask:
- case X86::BI__builtin_ia32_getmantsh_round_mask:
- case X86::BI__builtin_ia32_rangepd512_mask:
- case X86::BI__builtin_ia32_rangeps512_mask:
- case X86::BI__builtin_ia32_rangesd128_round_mask:
- case X86::BI__builtin_ia32_rangess128_round_mask:
- case X86::BI__builtin_ia32_reducesd_mask:
- case X86::BI__builtin_ia32_reducess_mask:
- case X86::BI__builtin_ia32_reducesh_mask:
- case X86::BI__builtin_ia32_rndscalesd_round_mask:
- case X86::BI__builtin_ia32_rndscaless_round_mask:
- case X86::BI__builtin_ia32_rndscalesh_round_mask:
- ArgNum = 5;
- break;
- case X86::BI__builtin_ia32_vcvtsd2si64:
- case X86::BI__builtin_ia32_vcvtsd2si32:
- case X86::BI__builtin_ia32_vcvtsd2usi32:
- case X86::BI__builtin_ia32_vcvtsd2usi64:
- case X86::BI__builtin_ia32_vcvtss2si32:
- case X86::BI__builtin_ia32_vcvtss2si64:
- case X86::BI__builtin_ia32_vcvtss2usi32:
- case X86::BI__builtin_ia32_vcvtss2usi64:
- case X86::BI__builtin_ia32_vcvtsh2si32:
- case X86::BI__builtin_ia32_vcvtsh2si64:
- case X86::BI__builtin_ia32_vcvtsh2usi32:
- case X86::BI__builtin_ia32_vcvtsh2usi64:
- case X86::BI__builtin_ia32_sqrtpd512:
- case X86::BI__builtin_ia32_sqrtps512:
- case X86::BI__builtin_ia32_sqrtph512:
- ArgNum = 1;
- HasRC = true;
- break;
- case X86::BI__builtin_ia32_addph512:
- case X86::BI__builtin_ia32_divph512:
- case X86::BI__builtin_ia32_mulph512:
- case X86::BI__builtin_ia32_subph512:
- case X86::BI__builtin_ia32_addpd512:
- case X86::BI__builtin_ia32_addps512:
- case X86::BI__builtin_ia32_divpd512:
- case X86::BI__builtin_ia32_divps512:
- case X86::BI__builtin_ia32_mulpd512:
- case X86::BI__builtin_ia32_mulps512:
- case X86::BI__builtin_ia32_subpd512:
- case X86::BI__builtin_ia32_subps512:
- case X86::BI__builtin_ia32_cvtsi2sd64:
- case X86::BI__builtin_ia32_cvtsi2ss32:
- case X86::BI__builtin_ia32_cvtsi2ss64:
- case X86::BI__builtin_ia32_cvtusi2sd64:
- case X86::BI__builtin_ia32_cvtusi2ss32:
- case X86::BI__builtin_ia32_cvtusi2ss64:
- case X86::BI__builtin_ia32_vcvtusi2sh:
- case X86::BI__builtin_ia32_vcvtusi642sh:
- case X86::BI__builtin_ia32_vcvtsi2sh:
- case X86::BI__builtin_ia32_vcvtsi642sh:
- ArgNum = 2;
- HasRC = true;
- break;
- case X86::BI__builtin_ia32_cvtdq2ps512_mask:
- case X86::BI__builtin_ia32_cvtudq2ps512_mask:
- case X86::BI__builtin_ia32_vcvtpd2ph512_mask:
- case X86::BI__builtin_ia32_vcvtps2phx512_mask:
- case X86::BI__builtin_ia32_cvtpd2ps512_mask:
- case X86::BI__builtin_ia32_cvtpd2dq512_mask:
- case X86::BI__builtin_ia32_cvtpd2qq512_mask:
- case X86::BI__builtin_ia32_cvtpd2udq512_mask:
- case X86::BI__builtin_ia32_cvtpd2uqq512_mask:
- case X86::BI__builtin_ia32_cvtps2dq512_mask:
- case X86::BI__builtin_ia32_cvtps2qq512_mask:
- case X86::BI__builtin_ia32_cvtps2udq512_mask:
- case X86::BI__builtin_ia32_cvtps2uqq512_mask:
- case X86::BI__builtin_ia32_cvtqq2pd512_mask:
- case X86::BI__builtin_ia32_cvtqq2ps512_mask:
- case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
- case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
- case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
- case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
- case X86::BI__builtin_ia32_vcvtw2ph512_mask:
- case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
- case X86::BI__builtin_ia32_vcvtph2w512_mask:
- case X86::BI__builtin_ia32_vcvtph2uw512_mask:
- case X86::BI__builtin_ia32_vcvtph2dq512_mask:
- case X86::BI__builtin_ia32_vcvtph2udq512_mask:
- case X86::BI__builtin_ia32_vcvtph2qq512_mask:
- case X86::BI__builtin_ia32_vcvtph2uqq512_mask:
- case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
- case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
- ArgNum = 3;
- HasRC = true;
- break;
- case X86::BI__builtin_ia32_addsh_round_mask:
- case X86::BI__builtin_ia32_addss_round_mask:
- case X86::BI__builtin_ia32_addsd_round_mask:
- case X86::BI__builtin_ia32_divsh_round_mask:
- case X86::BI__builtin_ia32_divss_round_mask:
- case X86::BI__builtin_ia32_divsd_round_mask:
- case X86::BI__builtin_ia32_mulsh_round_mask:
- case X86::BI__builtin_ia32_mulss_round_mask:
- case X86::BI__builtin_ia32_mulsd_round_mask:
- case X86::BI__builtin_ia32_subsh_round_mask:
- case X86::BI__builtin_ia32_subss_round_mask:
- case X86::BI__builtin_ia32_subsd_round_mask:
- case X86::BI__builtin_ia32_scalefph512_mask:
- case X86::BI__builtin_ia32_scalefpd512_mask:
- case X86::BI__builtin_ia32_scalefps512_mask:
- case X86::BI__builtin_ia32_scalefsd_round_mask:
- case X86::BI__builtin_ia32_scalefss_round_mask:
- case X86::BI__builtin_ia32_scalefsh_round_mask:
- case X86::BI__builtin_ia32_cvtsd2ss_round_mask:
- case X86::BI__builtin_ia32_vcvtss2sh_round_mask:
- case X86::BI__builtin_ia32_vcvtsd2sh_round_mask:
- case X86::BI__builtin_ia32_sqrtsd_round_mask:
- case X86::BI__builtin_ia32_sqrtss_round_mask:
- case X86::BI__builtin_ia32_sqrtsh_round_mask:
- case X86::BI__builtin_ia32_vfmaddsd3_mask:
- case X86::BI__builtin_ia32_vfmaddsd3_maskz:
- case X86::BI__builtin_ia32_vfmaddsd3_mask3:
- case X86::BI__builtin_ia32_vfmaddss3_mask:
- case X86::BI__builtin_ia32_vfmaddss3_maskz:
- case X86::BI__builtin_ia32_vfmaddss3_mask3:
- case X86::BI__builtin_ia32_vfmaddsh3_mask:
- case X86::BI__builtin_ia32_vfmaddsh3_maskz:
- case X86::BI__builtin_ia32_vfmaddsh3_mask3:
- case X86::BI__builtin_ia32_vfmaddpd512_mask:
- case X86::BI__builtin_ia32_vfmaddpd512_maskz:
- case X86::BI__builtin_ia32_vfmaddpd512_mask3:
- case X86::BI__builtin_ia32_vfmsubpd512_mask3:
- case X86::BI__builtin_ia32_vfmaddps512_mask:
- case X86::BI__builtin_ia32_vfmaddps512_maskz:
- case X86::BI__builtin_ia32_vfmaddps512_mask3:
- case X86::BI__builtin_ia32_vfmsubps512_mask3:
- case X86::BI__builtin_ia32_vfmaddph512_mask:
- case X86::BI__builtin_ia32_vfmaddph512_maskz:
- case X86::BI__builtin_ia32_vfmaddph512_mask3:
- case X86::BI__builtin_ia32_vfmsubph512_mask3:
- case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
- case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
- case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
- case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
- case X86::BI__builtin_ia32_vfmaddsubps512_mask:
- case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
- case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
- case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
- case X86::BI__builtin_ia32_vfmaddsubph512_mask:
- case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
- case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
- case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
- case X86::BI__builtin_ia32_vfmaddcsh_mask:
- case X86::BI__builtin_ia32_vfmaddcsh_round_mask:
- case X86::BI__builtin_ia32_vfmaddcsh_round_mask3:
- case X86::BI__builtin_ia32_vfmaddcph512_mask:
- case X86::BI__builtin_ia32_vfmaddcph512_maskz:
- case X86::BI__builtin_ia32_vfmaddcph512_mask3:
- case X86::BI__builtin_ia32_vfcmaddcsh_mask:
- case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
- case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
- case X86::BI__builtin_ia32_vfcmaddcph512_mask:
- case X86::BI__builtin_ia32_vfcmaddcph512_maskz:
- case X86::BI__builtin_ia32_vfcmaddcph512_mask3:
- case X86::BI__builtin_ia32_vfmulcsh_mask:
- case X86::BI__builtin_ia32_vfmulcph512_mask:
- case X86::BI__builtin_ia32_vfcmulcsh_mask:
- case X86::BI__builtin_ia32_vfcmulcph512_mask:
- ArgNum = 4;
- HasRC = true;
- break;
- }
-
- llvm::APSInt Result;
-
- // We can't check the value of a dependent argument.
- Expr *Arg = TheCall->getArg(ArgNum);
- if (Arg->isTypeDependent() || Arg->isValueDependent())
- return false;
-
- // Check constant-ness first.
- if (BuiltinConstantArg(TheCall, ArgNum, Result))
- return true;
-
- // Make sure rounding mode is either ROUND_CUR_DIRECTION or ROUND_NO_EXC bit
- // is set. If the intrinsic has rounding control(bits 1:0), make sure its only
- // combined with ROUND_NO_EXC. If the intrinsic does not have rounding
- // control, allow ROUND_NO_EXC and ROUND_CUR_DIRECTION together.
- if (Result == 4/*ROUND_CUR_DIRECTION*/ ||
- Result == 8/*ROUND_NO_EXC*/ ||
- (!HasRC && Result == 12/*ROUND_CUR_DIRECTION|ROUND_NO_EXC*/) ||
- (HasRC && Result.getZExtValue() >= 8 && Result.getZExtValue() <= 11))
- return false;
-
- return Diag(TheCall->getBeginLoc(), diag::err_x86_builtin_invalid_rounding)
- << Arg->getSourceRange();
-}
-
-// Check if the gather/scatter scale is legal.
-bool Sema::CheckX86BuiltinGatherScatterScale(unsigned BuiltinID,
- CallExpr *TheCall) {
- unsigned ArgNum = 0;
- switch (BuiltinID) {
- default:
- return false;
- case X86::BI__builtin_ia32_gatherpfdpd:
- case X86::BI__builtin_ia32_gatherpfdps:
- case X86::BI__builtin_ia32_gatherpfqpd:
- case X86::BI__builtin_ia32_gatherpfqps:
- case X86::BI__builtin_ia32_scatterpfdpd:
- case X86::BI__builtin_ia32_scatterpfdps:
- case X86::BI__builtin_ia32_scatterpfqpd:
- case X86::BI__builtin_ia32_scatterpfqps:
- ArgNum = 3;
- break;
- case X86::BI__builtin_ia32_gatherd_pd:
- case X86::BI__builtin_ia32_gatherd_pd256:
- case X86::BI__builtin_ia32_gatherq_pd:
- case X86::BI__builtin_ia32_gatherq_pd256:
- case X86::BI__builtin_ia32_gatherd_ps:
- case X86::BI__builtin_ia32_gatherd_ps256:
- case X86::BI__builtin_ia32_gatherq_ps:
- case X86::BI__builtin_ia32_gatherq_ps256:
- case X86::BI__builtin_ia32_gatherd_q:
- case X86::BI__builtin_ia32_gatherd_q256:
- case X86::BI__builtin_ia32_gatherq_q:
- case X86::BI__builtin_ia32_gatherq_q256:
- case X86::BI__builtin_ia32_gatherd_d:
- case X86::BI__builtin_ia32_gatherd_d256:
- case X86::BI__builtin_ia32_gatherq_d:
- case X86::BI__builtin_ia32_gatherq_d256:
- case X86::BI__builtin_ia32_gather3div2df:
- case X86::BI__builtin_ia32_gather3div2di:
- case X86::BI__builtin_ia32_gather3div4df:
- case X86::BI__builtin_ia32_gather3div4di:
- case X86::BI__builtin_ia32_gather3div4sf:
- case X86::BI__builtin_ia32_gather3div4si:
- case X86::BI__builtin_ia32_gather3div8sf:
- case X86::BI__builtin_ia32_gather3div8si:
- case X86::BI__builtin_ia32_gather3siv2df:
- case X86::BI__builtin_ia32_gather3siv2di:
- case X86::BI__builtin_ia32_gather3siv4df:
- case X86::BI__builtin_ia32_gather3siv4di:
- case X86::BI__builtin_ia32_gather3siv4sf:
- case X86::BI__builtin_ia32_gather3siv4si:
- case X86::BI__builtin_ia32_gather3siv8sf:
- case X86::BI__builtin_ia32_gather3siv8si:
- case X86::BI__builtin_ia32_gathersiv8df:
- case X86::BI__builtin_ia32_gathersiv16sf:
- case X86::BI__builtin_ia32_gatherdiv8df:
- case X86::BI__builtin_ia32_gatherdiv16sf:
- case X86::BI__builtin_ia32_gathersiv8di:
- case X86::BI__builtin_ia32_gathersiv16si:
- case X86::BI__builtin_ia32_gatherdiv8di:
- case X86::BI__builtin_ia32_gatherdiv16si:
- case X86::BI__builtin_ia32_scatterdiv2df:
- case X86::BI__builtin_ia32_scatterdiv2di:
- case X86::BI__builtin_ia32_scatterdiv4df:
- case X86::BI__builtin_ia32_scatterdiv4di:
- case X86::BI__builtin_ia32_scatterdiv4sf:
- case X86::BI__builtin_ia32_scatterdiv4si:
- case X86::BI__builtin_ia32_scatterdiv8sf:
- case X86::BI__builtin_ia32_scatterdiv8si:
- case X86::BI__builtin_ia32_scattersiv2df:
- case X86::BI__builtin_ia32_scattersiv2di:
- case X86::BI__builtin_ia32_scattersiv4df:
- case X86::BI__builtin_ia32_scattersiv4di:
- case X86::BI__builtin_ia32_scattersiv4sf:
- case X86::BI__builtin_ia32_scattersiv4si:
- case X86::BI__builtin_ia32_scattersiv8sf:
- case X86::BI__builtin_ia32_scattersiv8si:
- case X86::BI__builtin_ia32_scattersiv8df:
- case X86::BI__builtin_ia32_scattersiv16sf:
- case X86::BI__builtin_ia32_scatterdiv8df:
- case X86::BI__builtin_ia32_scatterdiv16sf:
- case X86::BI__builtin_ia32_scattersiv8di:
- case X86::BI__builtin_ia32_scattersiv16si:
- case X86::BI__builtin_ia32_scatterdiv8di:
- case X86::BI__builtin_ia32_scatterdiv16si:
- ArgNum = 4;
- break;
- }
-
- llvm::APSInt Result;
-
- // We can't check the value of a dependent argument.
- Expr *Arg = TheCall->getArg(ArgNum);
- if (Arg->isTypeDependent() || Arg->isValueDependent())
- return false;
-
- // Check constant-ness first.
- if (BuiltinConstantArg(TheCall, ArgNum, Result))
- return true;
-
- if (Result == 1 || Result == 2 || Result == 4 || Result == 8)
- return false;
-
- return Diag(TheCall->getBeginLoc(), diag::err_x86_builtin_invalid_scale)
- << Arg->getSourceRange();
-}
-
-enum { TileRegLow = 0, TileRegHigh = 7 };
-
-bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall,
- ArrayRef<int> ArgNums) {
- for (int ArgNum : ArgNums) {
- if (BuiltinConstantArgRange(TheCall, ArgNum, TileRegLow, TileRegHigh))
- return true;
- }
- return false;
-}
-
-bool Sema::CheckX86BuiltinTileDuplicate(CallExpr *TheCall,
- ArrayRef<int> ArgNums) {
- // Because the max number of tile register is TileRegHigh + 1, so here we use
- // each bit to represent the usage of them in bitset.
- std::bitset<TileRegHigh + 1> ArgValues;
- for (int ArgNum : ArgNums) {
- Expr *Arg = TheCall->getArg(ArgNum);
- if (Arg->isTypeDependent() || Arg->isValueDependent())
- continue;
-
- llvm::APSInt Result;
- if (BuiltinConstantArg(TheCall, ArgNum, Result))
- return true;
- int ArgExtValue = Result.getExtValue();
- assert((ArgExtValue >= TileRegLow && ArgExtValue <= TileRegHigh) &&
- "Incorrect tile register num.");
- if (ArgValues.test(ArgExtValue))
- return Diag(TheCall->getBeginLoc(),
- diag::err_x86_builtin_tile_arg_duplicate)
- << TheCall->getArg(ArgNum)->getSourceRange();
- ArgValues.set(ArgExtValue);
- }
- return false;
-}
-
-bool Sema::CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall,
- ArrayRef<int> ArgNums) {
- return CheckX86BuiltinTileArgumentsRange(TheCall, ArgNums) ||
- CheckX86BuiltinTileDuplicate(TheCall, ArgNums);
-}
-
-bool Sema::CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
- switch (BuiltinID) {
- default:
- return false;
- case X86::BI__builtin_ia32_tileloadd64:
- case X86::BI__builtin_ia32_tileloaddt164:
- case X86::BI__builtin_ia32_tilestored64:
- case X86::BI__builtin_ia32_tilezero:
- return CheckX86BuiltinTileArgumentsRange(TheCall, 0);
- case X86::BI__builtin_ia32_tdpbssd:
- case X86::BI__builtin_ia32_tdpbsud:
- case X86::BI__builtin_ia32_tdpbusd:
- case X86::BI__builtin_ia32_tdpbuud:
- case X86::BI__builtin_ia32_tdpbf16ps:
- case X86::BI__builtin_ia32_tdpfp16ps:
- case X86::BI__builtin_ia32_tcmmimfp16ps:
- case X86::BI__builtin_ia32_tcmmrlfp16ps:
- return CheckX86BuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
- }
-}
-static bool isX86_32Builtin(unsigned BuiltinID) {
- // These builtins only work on x86-32 targets.
- switch (BuiltinID) {
- case X86::BI__builtin_ia32_readeflags_u32:
- case X86::BI__builtin_ia32_writeeflags_u32:
- return true;
+ return checkArgCountAtMost(TheCall, 3);
}
return false;
}
-bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
- CallExpr *TheCall) {
- // Check for 32-bit only builtins on a 64-bit target.
- const llvm::Triple &TT = TI.getTriple();
- if (TT.getArch() != llvm::Triple::x86 && isX86_32Builtin(BuiltinID))
- return Diag(TheCall->getCallee()->getBeginLoc(),
- diag::err_32_bit_builtin_64_bit_tgt);
-
- // If the intrinsic has rounding or SAE make sure its valid.
- if (CheckX86BuiltinRoundingOrSAE(BuiltinID, TheCall))
- return true;
-
- // If the intrinsic has a gather/scatter scale immediate make sure its valid.
- if (CheckX86BuiltinGatherScatterScale(BuiltinID, TheCall))
- return true;
-
- // If the intrinsic has a tile arguments, make sure they are valid.
- if (CheckX86BuiltinTileArguments(BuiltinID, TheCall))
- return true;
-
- // For intrinsics which take an immediate value as part of the instruction,
- // range check them here.
- int i = 0, l = 0, u = 0;
- switch (BuiltinID) {
- default:
- return false;
- case X86::BI__builtin_ia32_vec_ext_v2si:
- case X86::BI__builtin_ia32_vec_ext_v2di:
- case X86::BI__builtin_ia32_vextractf128_pd256:
- case X86::BI__builtin_ia32_vextractf128_ps256:
- case X86::BI__builtin_ia32_vextractf128_si256:
- case X86::BI__builtin_ia32_extract128i256:
- case X86::BI__builtin_ia32_extractf64x4_mask:
- case X86::BI__builtin_ia32_extracti64x4_mask:
- case X86::BI__builtin_ia32_extractf32x8_mask:
- case X86::BI__builtin_ia32_extracti32x8_mask:
- case X86::BI__builtin_ia32_extractf64x2_256_mask:
- case X86::BI__builtin_ia32_extracti64x2_256_mask:
- case X86::BI__builtin_ia32_extractf32x4_256_mask:
- case X86::BI__builtin_ia32_extracti32x4_256_mask:
- i = 1; l = 0; u = 1;
- break;
- case X86::BI__builtin_ia32_vec_set_v2di:
- case X86::BI__builtin_ia32_vinsertf128_pd256:
- case X86::BI__builtin_ia32_vinsertf128_ps256:
- case X86::BI__builtin_ia32_vinsertf128_si256:
- case X86::BI__builtin_ia32_insert128i256:
- case X86::BI__builtin_ia32_insertf32x8:
- case X86::BI__builtin_ia32_inserti32x8:
- case X86::BI__builtin_ia32_insertf64x4:
- case X86::BI__builtin_ia32_inserti64x4:
- case X86::BI__builtin_ia32_insertf64x2_256:
- case X86::BI__builtin_ia32_inserti64x2_256:
- case X86::BI__builtin_ia32_insertf32x4_256:
- case X86::BI__builtin_ia32_inserti32x4_256:
- i = 2; l = 0; u = 1;
- break;
- case X86::BI__builtin_ia32_vpermilpd:
- case X86::BI__builtin_ia32_vec_ext_v4hi:
- case X86::BI__builtin_ia32_vec_ext_v4si:
- case X86::BI__builtin_ia32_vec_ext_v4sf:
- case X86::BI__builtin_ia32_vec_ext_v4di:
- case X86::BI__builtin_ia32_extractf32x4_mask:
- case X86::BI__builtin_ia32_extracti32x4_mask:
- case X86::BI__builtin_ia32_extractf64x2_512_mask:
- case X86::BI__builtin_ia32_extracti64x2_512_mask:
- i = 1; l = 0; u = 3;
- break;
- case X86::BI_mm_prefetch:
- case X86::BI__builtin_ia32_vec_ext_v8hi:
- case X86::BI__builtin_ia32_vec_ext_v8si:
- i = 1; l = 0; u = 7;
- break;
- case X86::BI__builtin_ia32_sha1rnds4:
- case X86::BI__builtin_ia32_blendpd:
- case X86::BI__builtin_ia32_shufpd:
- case X86::BI__builtin_ia32_vec_set_v4hi:
- case X86::BI__builtin_ia32_vec_set_v4si:
- case X86::BI__builtin_ia32_vec_set_v4di:
- case X86::BI__builtin_ia32_shuf_f32x4_256:
- case X86::BI__builtin_ia32_shuf_f64x2_256:
- case X86::BI__builtin_ia32_shuf_i32x4_256:
- case X86::BI__builtin_ia32_shuf_i64x2_256:
- case X86::BI__builtin_ia32_insertf64x2_512:
- case X86::BI__builtin_ia32_inserti64x2_512:
- case X86::BI__builtin_ia32_insertf32x4:
- case X86::BI__builtin_ia32_inserti32x4:
- i = 2; l = 0; u = 3;
- break;
- case X86::BI__builtin_ia32_vpermil2pd:
- case X86::BI__builtin_ia32_vpermil2pd256:
- case X86::BI__builtin_ia32_vpermil2ps:
- case X86::BI__builtin_ia32_vpermil2ps256:
- i = 3; l = 0; u = 3;
- break;
- case X86::BI__builtin_ia32_cmpb128_mask:
- case X86::BI__builtin_ia32_cmpw128_mask:
- case X86::BI__builtin_ia32_cmpd128_mask:
- case X86::BI__builtin_ia32_cmpq128_mask:
- case X86::BI__builtin_ia32_cmpb256_mask:
- case X86::BI__builtin_ia32_cmpw256_mask:
- case X86::BI__builtin_ia32_cmpd256_mask:
- case X86::BI__builtin_ia32_cmpq256_mask:
- case X86::BI__builtin_ia32_cmpb512_mask:
- case X86::BI__builtin_ia32_cmpw512_mask:
- case X86::BI__builtin_ia32_cmpd512_mask:
- case X86::BI__builtin_ia32_cmpq512_mask:
- case X86::BI__builtin_ia32_ucmpb128_mask:
- case X86::BI__builtin_ia32_ucmpw128_mask:
- case X86::BI__builtin_ia32_ucmpd128_mask:
- case X86::BI__builtin_ia32_ucmpq128_mask:
- case X86::BI__builtin_ia32_ucmpb256_mask:
- case X86::BI__builtin_ia32_ucmpw256_mask:
- case X86::BI__builtin_ia32_ucmpd256_mask:
- case X86::BI__builtin_ia32_ucmpq256_mask:
- case X86::BI__builtin_ia32_ucmpb512_mask:
- case X86::BI__builtin_ia32_ucmpw512_mask:
- case X86::BI__builtin_ia32_ucmpd512_mask:
- case X86::BI__builtin_ia32_ucmpq512_mask:
- case X86::BI__builtin_ia32_vpcomub:
- case X86::BI__builtin_ia32_vpcomuw:
- case X86::BI__builtin_ia32_vpcomud:
- case X86::BI__builtin_ia32_vpcomuq:
- case X86::BI__builtin_ia32_vpcomb:
- case X86::BI__builtin_ia32_vpcomw:
- case X86::BI__builtin_ia32_vpcomd:
- case X86::BI__builtin_ia32_vpcomq:
- case X86::BI__builtin_ia32_vec_set_v8hi:
- case X86::BI__builtin_ia32_vec_set_v8si:
- i = 2; l = 0; u = 7;
- break;
- case X86::BI__builtin_ia32_vpermilpd256:
- case X86::BI__builtin_ia32_roundps:
- case X86::BI__builtin_ia32_roundpd:
- case X86::BI__builtin_ia32_roundps256:
- case X86::BI__builtin_ia32_roundpd256:
- case X86::BI__builtin_ia32_getmantpd128_mask:
- case X86::BI__builtin_ia32_getmantpd256_mask:
- case X86::BI__builtin_ia32_getmantps128_mask:
- case X86::BI__builtin_ia32_getmantps256_mask:
- case X86::BI__builtin_ia32_getmantpd512_mask:
- case X86::BI__builtin_ia32_getmantps512_mask:
- case X86::BI__builtin_ia32_getmantph128_mask:
- case X86::BI__builtin_ia32_getmantph256_mask:
- case X86::BI__builtin_ia32_getmantph512_mask:
- case X86::BI__builtin_ia32_vec_ext_v16qi:
- case X86::BI__builtin_ia32_vec_ext_v16hi:
- i = 1; l = 0; u = 15;
- break;
- case X86::BI__builtin_ia32_pblendd128:
- case X86::BI__builtin_ia32_blendps:
- case X86::BI__builtin_ia32_blendpd256:
- case X86::BI__builtin_ia32_shufpd256:
- case X86::BI__builtin_ia32_roundss:
- case X86::BI__builtin_ia32_roundsd:
- case X86::BI__builtin_ia32_rangepd128_mask:
- case X86::BI__builtin_ia32_rangepd256_mask:
- case X86::BI__builtin_ia32_rangepd512_mask:
- case X86::BI__builtin_ia32_rangeps128_mask:
- case X86::BI__builtin_ia32_rangeps256_mask:
- case X86::BI__builtin_ia32_rangeps512_mask:
- case X86::BI__builtin_ia32_getmantsd_round_mask:
- case X86::BI__builtin_ia32_getmantss_round_mask:
- case X86::BI__builtin_ia32_getmantsh_round_mask:
- case X86::BI__builtin_ia32_vec_set_v16qi:
- case X86::BI__builtin_ia32_vec_set_v16hi:
- i = 2; l = 0; u = 15;
- break;
- case X86::BI__builtin_ia32_vec_ext_v32qi:
- i = 1; l = 0; u = 31;
- break;
- case X86::BI__builtin_ia32_cmpps:
- case X86::BI__builtin_ia32_cmpss:
- case X86::BI__builtin_ia32_cmppd:
- case X86::BI__builtin_ia32_cmpsd:
- case X86::BI__builtin_ia32_cmpps256:
- case X86::BI__builtin_ia32_cmppd256:
- case X86::BI__builtin_ia32_cmpps128_mask:
- case X86::BI__builtin_ia32_cmppd128_mask:
- case X86::BI__builtin_ia32_cmpps256_mask:
- case X86::BI__builtin_ia32_cmppd256_mask:
- case X86::BI__builtin_ia32_cmpps512_mask:
- case X86::BI__builtin_ia32_cmppd512_mask:
- case X86::BI__builtin_ia32_cmpsd_mask:
- case X86::BI__builtin_ia32_cmpss_mask:
- case X86::BI__builtin_ia32_vec_set_v32qi:
- i = 2; l = 0; u = 31;
- break;
- case X86::BI__builtin_ia32_permdf256:
- case X86::BI__builtin_ia32_permdi256:
- case X86::BI__builtin_ia32_permdf512:
- case X86::BI__builtin_ia32_permdi512:
- case X86::BI__builtin_ia32_vpermilps:
- case X86::BI__builtin_ia32_vpermilps256:
- case X86::BI__builtin_ia32_vpermilpd512:
- case X86::BI__builtin_ia32_vpermilps512:
- case X86::BI__builtin_ia32_pshufd:
- case X86::BI__builtin_ia32_pshufd256:
- case X86::BI__builtin_ia32_pshufd512:
- case X86::BI__builtin_ia32_pshufhw:
- case X86::BI__builtin_ia32_pshufhw256:
- case X86::BI__builtin_ia32_pshufhw512:
- case X86::BI__builtin_ia32_pshuflw:
- case X86::BI__builtin_ia32_pshuflw256:
- case X86::BI__builtin_ia32_pshuflw512:
- case X86::BI__builtin_ia32_vcvtps2ph:
- case X86::BI__builtin_ia32_vcvtps2ph_mask:
- case X86::BI__builtin_ia32_vcvtps2ph256:
- case X86::BI__builtin_ia32_vcvtps2ph256_mask:
- case X86::BI__builtin_ia32_vcvtps2ph512_mask:
- case X86::BI__builtin_ia32_rndscaleps_128_mask:
- case X86::BI__builtin_ia32_rndscalepd_128_mask:
- case X86::BI__builtin_ia32_rndscaleps_256_mask:
- case X86::BI__builtin_ia32_rndscalepd_256_mask:
- case X86::BI__builtin_ia32_rndscaleps_mask:
- case X86::BI__builtin_ia32_rndscalepd_mask:
- case X86::BI__builtin_ia32_rndscaleph_mask:
- case X86::BI__builtin_ia32_reducepd128_mask:
- case X86::BI__builtin_ia32_reducepd256_mask:
- case X86::BI__builtin_ia32_reducepd512_mask:
- case X86::BI__builtin_ia32_reduceps128_mask:
- case X86::BI__builtin_ia32_reduceps256_mask:
- case X86::BI__builtin_ia32_reduceps512_mask:
- case X86::BI__builtin_ia32_reduceph128_mask:
- case X86::BI__builtin_ia32_reduceph256_mask:
- case X86::BI__builtin_ia32_reduceph512_mask:
- case X86::BI__builtin_ia32_prold512:
- case X86::BI__builtin_ia32_prolq512:
- case X86::BI__builtin_ia32_prold128:
- case X86::BI__builtin_ia32_prold256:
- case X86::BI__builtin_ia32_prolq128:
- case X86::BI__builtin_ia32_prolq256:
- case X86::BI__builtin_ia32_prord512:
- case X86::BI__builtin_ia32_prorq512:
- case X86::BI__builtin_ia32_prord128:
- case X86::BI__builtin_ia32_prord256:
- case X86::BI__builtin_ia32_prorq128:
- case X86::BI__builtin_ia32_prorq256:
- case X86::BI__builtin_ia32_fpclasspd128_mask:
- case X86::BI__builtin_ia32_fpclasspd256_mask:
- case X86::BI__builtin_ia32_fpclassps128_mask:
- case X86::BI__builtin_ia32_fpclassps256_mask:
- case X86::BI__builtin_ia32_fpclassps512_mask:
- case X86::BI__builtin_ia32_fpclasspd512_mask:
- case X86::BI__builtin_ia32_fpclassph128_mask:
- case X86::BI__builtin_ia32_fpclassph256_mask:
- case X86::BI__builtin_ia32_fpclassph512_mask:
- case X86::BI__builtin_ia32_fpclasssd_mask:
- case X86::BI__builtin_ia32_fpclassss_mask:
- case X86::BI__builtin_ia32_fpclasssh_mask:
- case X86::BI__builtin_ia32_pslldqi128_byteshift:
- case X86::BI__builtin_ia32_pslldqi256_byteshift:
- case X86::BI__builtin_ia32_pslldqi512_byteshift:
- case X86::BI__builtin_ia32_psrldqi128_byteshift:
- case X86::BI__builtin_ia32_psrldqi256_byteshift:
- case X86::BI__builtin_ia32_psrldqi512_byteshift:
- case X86::BI__builtin_ia32_kshiftliqi:
- case X86::BI__builtin_ia32_kshiftlihi:
- case X86::BI__builtin_ia32_kshiftlisi:
- case X86::BI__builtin_ia32_kshiftlidi:
- case X86::BI__builtin_ia32_kshiftriqi:
- case X86::BI__builtin_ia32_kshiftrihi:
- case X86::BI__builtin_ia32_kshiftrisi:
- case X86::BI__builtin_ia32_kshiftridi:
- i = 1; l = 0; u = 255;
- break;
- case X86::BI__builtin_ia32_vperm2f128_pd256:
- case X86::BI__builtin_ia32_vperm2f128_ps256:
- case X86::BI__builtin_ia32_vperm2f128_si256:
- case X86::BI__builtin_ia32_permti256:
- case X86::BI__builtin_ia32_pblendw128:
- case X86::BI__builtin_ia32_pblendw256:
- case X86::BI__builtin_ia32_blendps256:
- case X86::BI__builtin_ia32_pblendd256:
- case X86::BI__builtin_ia32_palignr128:
- case X86::BI__builtin_ia32_palignr256:
- case X86::BI__builtin_ia32_palignr512:
- case X86::BI__builtin_ia32_alignq512:
- case X86::BI__builtin_ia32_alignd512:
- case X86::BI__builtin_ia32_alignd128:
- case X86::BI__builtin_ia32_alignd256:
- case X86::BI__builtin_ia32_alignq128:
- case X86::BI__builtin_ia32_alignq256:
- case X86::BI__builtin_ia32_vcomisd:
- case X86::BI__builtin_ia32_vcomiss:
- case X86::BI__builtin_ia32_shuf_f32x4:
- case X86::BI__builtin_ia32_shuf_f64x2:
- case X86::BI__builtin_ia32_shuf_i32x4:
- case X86::BI__builtin_ia32_shuf_i64x2:
- case X86::BI__builtin_ia32_shufpd512:
- case X86::BI__builtin_ia32_shufps:
- case X86::BI__builtin_ia32_shufps256:
- case X86::BI__builtin_ia32_shufps512:
- case X86::BI__builtin_ia32_dbpsadbw128:
- case X86::BI__builtin_ia32_dbpsadbw256:
- case X86::BI__builtin_ia32_dbpsadbw512:
- case X86::BI__builtin_ia32_vpshldd128:
- case X86::BI__builtin_ia32_vpshldd256:
- case X86::BI__builtin_ia32_vpshldd512:
- case X86::BI__builtin_ia32_vpshldq128:
- case X86::BI__builtin_ia32_vpshldq256:
- case X86::BI__builtin_ia32_vpshldq512:
- case X86::BI__builtin_ia32_vpshldw128:
- case X86::BI__builtin_ia32_vpshldw256:
- case X86::BI__builtin_ia32_vpshldw512:
- case X86::BI__builtin_ia32_vpshrdd128:
- case X86::BI__builtin_ia32_vpshrdd256:
- case X86::BI__builtin_ia32_vpshrdd512:
- case X86::BI__builtin_ia32_vpshrdq128:
- case X86::BI__builtin_ia32_vpshrdq256:
- case X86::BI__builtin_ia32_vpshrdq512:
- case X86::BI__builtin_ia32_vpshrdw128:
- case X86::BI__builtin_ia32_vpshrdw256:
- case X86::BI__builtin_ia32_vpshrdw512:
- i = 2; l = 0; u = 255;
- break;
- case X86::BI__builtin_ia32_fixupimmpd512_mask:
- case X86::BI__builtin_ia32_fixupimmpd512_maskz:
- case X86::BI__builtin_ia32_fixupimmps512_mask:
- case X86::BI__builtin_ia32_fixupimmps512_maskz:
- case X86::BI__builtin_ia32_fixupimmsd_mask:
- case X86::BI__builtin_ia32_fixupimmsd_maskz:
- case X86::BI__builtin_ia32_fixupimmss_mask:
- case X86::BI__builtin_ia32_fixupimmss_maskz:
- case X86::BI__builtin_ia32_fixupimmpd128_mask:
- case X86::BI__builtin_ia32_fixupimmpd128_maskz:
- case X86::BI__builtin_ia32_fixupimmpd256_mask:
- case X86::BI__builtin_ia32_fixupimmpd256_maskz:
- case X86::BI__builtin_ia32_fixupimmps128_mask:
- case X86::BI__builtin_ia32_fixupimmps128_maskz:
- case X86::BI__builtin_ia32_fixupimmps256_mask:
- case X86::BI__builtin_ia32_fixupimmps256_maskz:
- case X86::BI__builtin_ia32_pternlogd512_mask:
- case X86::BI__builtin_ia32_pternlogd512_maskz:
- case X86::BI__builtin_ia32_pternlogq512_mask:
- case X86::BI__builtin_ia32_pternlogq512_maskz:
- case X86::BI__builtin_ia32_pternlogd128_mask:
- case X86::BI__builtin_ia32_pternlogd128_maskz:
- case X86::BI__builtin_ia32_pternlogd256_mask:
- case X86::BI__builtin_ia32_pternlogd256_maskz:
- case X86::BI__builtin_ia32_pternlogq128_mask:
- case X86::BI__builtin_ia32_pternlogq128_maskz:
- case X86::BI__builtin_ia32_pternlogq256_mask:
- case X86::BI__builtin_ia32_pternlogq256_maskz:
- case X86::BI__builtin_ia32_vsm3rnds2:
- i = 3; l = 0; u = 255;
- break;
- case X86::BI__builtin_ia32_gatherpfdpd:
- case X86::BI__builtin_ia32_gatherpfdps:
- case X86::BI__builtin_ia32_gatherpfqpd:
- case X86::BI__builtin_ia32_gatherpfqps:
- case X86::BI__builtin_ia32_scatterpfdpd:
- case X86::BI__builtin_ia32_scatterpfdps:
- case X86::BI__builtin_ia32_scatterpfqpd:
- case X86::BI__builtin_ia32_scatterpfqps:
- i = 4; l = 2; u = 3;
- break;
- case X86::BI__builtin_ia32_reducesd_mask:
- case X86::BI__builtin_ia32_reducess_mask:
- case X86::BI__builtin_ia32_rndscalesd_round_mask:
- case X86::BI__builtin_ia32_rndscaless_round_mask:
- case X86::BI__builtin_ia32_rndscalesh_round_mask:
- case X86::BI__builtin_ia32_reducesh_mask:
- i = 4; l = 0; u = 255;
- break;
- case X86::BI__builtin_ia32_cmpccxadd32:
- case X86::BI__builtin_ia32_cmpccxadd64:
- i = 3; l = 0; u = 15;
- break;
- }
-
- // Note that we don't force a hard error on the range check here, allowing
- // template-generated or macro-generated dead code to potentially have out-of-
- // range values. These need to code generate, but don't need to necessarily
- // make any sense. We use a warning that defaults to an error.
- return BuiltinConstantArgRange(TheCall, i, l, u, /*RangeIsError*/ false);
-}
-
/// Given a FunctionDecl's FormatAttr, attempts to populate the FomatStringInfo
/// parameter with the FormatAttr's correct format_idx and firstDataArg.
/// Returns true when the format fits the function and the FormatStringInfo has
@@ -9302,7 +7584,7 @@ ExprResult Sema::BuiltinNontemporalOverloaded(ExprResult TheCallResult) {
unsigned numArgs = isStore ? 2 : 1;
// Ensure that we have the proper number of arguments.
- if (checkArgCount(*this, TheCall, numArgs))
+ if (checkArgCount(TheCall, numArgs))
return ExprError();
// Inspect the last argument of the nontemporal builtin. This should always
@@ -9467,7 +7749,7 @@ bool Sema::BuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall) {
// In C23 mode, va_start only needs one argument. However, the builtin still
// requires two arguments (which matches the behavior of the GCC builtin),
// <stdarg.h> passes `0` as the second argument in C23 mode.
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
// Type-check the first argument normally.
@@ -9598,7 +7880,7 @@ bool Sema::BuiltinVAStartARMMicrosoft(CallExpr *Call) {
/// BuiltinUnorderedCompare - Handle functions like __builtin_isgreater and
/// friends. This is declared to take (...), so we have to check everything.
bool Sema::BuiltinUnorderedCompare(CallExpr *TheCall, unsigned BuiltinID) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
if (BuiltinID == Builtin::BI__builtin_isunordered &&
@@ -9642,7 +7924,7 @@ bool Sema::BuiltinUnorderedCompare(CallExpr *TheCall, unsigned BuiltinID) {
/// to check everything.
bool Sema::BuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs,
unsigned BuiltinID) {
- if (checkArgCount(*this, TheCall, NumArgs))
+ if (checkArgCount(TheCall, NumArgs))
return true;
FPOptions FPO = TheCall->getFPFeaturesInEffect(getLangOpts());
@@ -9727,7 +8009,7 @@ bool Sema::BuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs,
/// Perform semantic analysis for a call to __builtin_complex.
bool Sema::BuiltinComplex(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
bool Dependent = false;
@@ -9789,7 +8071,7 @@ bool Sema::BuiltinComplex(CallExpr *TheCall) {
// vector short vec_xxsldwi(vector short, vector short, int);
bool Sema::BuiltinVSX(CallExpr *TheCall) {
unsigned ExpectedNumArgs = 3;
- if (checkArgCount(*this, TheCall, ExpectedNumArgs))
+ if (checkArgCount(TheCall, ExpectedNumArgs))
return true;
// Check the third argument is a compile time constant
@@ -9976,7 +8258,7 @@ bool Sema::BuiltinArithmeticFence(CallExpr *TheCall) {
if (!Context.getTargetInfo().checkArithmeticFenceSupported())
return Diag(TheCall->getBeginLoc(), diag::err_builtin_target_unsupported)
<< SourceRange(TheCall->getBeginLoc(), TheCall->getEndLoc());
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
Expr *Arg = TheCall->getArg(0);
if (Arg->isInstantiationDependent())
@@ -10046,7 +8328,7 @@ bool Sema::BuiltinAllocaWithAlign(CallExpr *TheCall) {
/// Handle __builtin_assume_aligned. This is declared
/// as (const void*, size_t, ...) and can take one optional constant int arg.
bool Sema::BuiltinAssumeAligned(CallExpr *TheCall) {
- if (checkArgCountRange(*this, TheCall, 2, 3))
+ if (checkArgCountRange(TheCall, 2, 3))
return true;
unsigned NumArgs = TheCall->getNumArgs();
@@ -10349,7 +8631,7 @@ bool Sema::BuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum,
/// BuiltinARMMemoryTaggingCall - Handle calls of memory tagging extensions
bool Sema::BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
if (BuiltinID == AArch64::BI__builtin_arm_irg) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
Expr *Arg0 = TheCall->getArg(0);
Expr *Arg1 = TheCall->getArg(1);
@@ -10377,7 +8659,7 @@ bool Sema::BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
}
if (BuiltinID == AArch64::BI__builtin_arm_addg) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
Expr *Arg0 = TheCall->getArg(0);
@@ -10398,7 +8680,7 @@ bool Sema::BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
}
if (BuiltinID == AArch64::BI__builtin_arm_gmi) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
Expr *Arg0 = TheCall->getArg(0);
Expr *Arg1 = TheCall->getArg(1);
@@ -10421,7 +8703,7 @@ bool Sema::BuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
if (BuiltinID == AArch64::BI__builtin_arm_ldg ||
BuiltinID == AArch64::BI__builtin_arm_stg) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
Expr *Arg0 = TheCall->getArg(0);
ExprResult FirstArg = DefaultFunctionArrayLvalueConversion(Arg0);
@@ -10694,7 +8976,7 @@ bool Sema::BuiltinPPCMMACall(CallExpr *TheCall, unsigned BuiltinID,
(void) DecodePPCMMATypeFromStr(Context, TypeStr, Mask);
ArgNum++;
}
- if (checkArgCount(*this, TheCall, ArgNum))
+ if (checkArgCount(TheCall, ArgNum))
return true;
return false;
@@ -19706,7 +17988,7 @@ void Sema::CheckAddressOfPackedMember(Expr *rhs) {
}
bool Sema::PrepareBuiltinElementwiseMathOneArgCall(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
ExprResult A = UsualUnaryConversions(TheCall->getArg(0));
@@ -19745,7 +18027,7 @@ bool Sema::BuiltinVectorToScalarMath(CallExpr *TheCall) {
}
bool Sema::BuiltinVectorMath(CallExpr *TheCall, QualType &Res) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
ExprResult A = TheCall->getArg(0);
@@ -19774,7 +18056,7 @@ bool Sema::BuiltinVectorMath(CallExpr *TheCall, QualType &Res) {
bool Sema::BuiltinElementwiseTernaryMath(CallExpr *TheCall,
bool CheckForFloatArgs) {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
Expr *Args[3];
@@ -19817,7 +18099,7 @@ bool Sema::BuiltinElementwiseTernaryMath(CallExpr *TheCall,
}
bool Sema::PrepareBuiltinReduceMathOneArgCall(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
ExprResult A = UsualUnaryConversions(TheCall->getArg(0));
@@ -19829,7 +18111,7 @@ bool Sema::PrepareBuiltinReduceMathOneArgCall(CallExpr *TheCall) {
}
bool Sema::BuiltinNonDeterministicValue(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
ExprResult Arg = TheCall->getArg(0);
@@ -19845,7 +18127,7 @@ bool Sema::BuiltinNonDeterministicValue(CallExpr *TheCall) {
ExprResult Sema::BuiltinMatrixTranspose(CallExpr *TheCall,
ExprResult CallResult) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return ExprError();
ExprResult MatrixArg = DefaultLvalueConversion(TheCall->getArg(0));
@@ -19900,7 +18182,7 @@ ExprResult Sema::BuiltinMatrixColumnMajorLoad(CallExpr *TheCall,
return ExprError();
}
- if (checkArgCount(*this, TheCall, 4))
+ if (checkArgCount(TheCall, 4))
return ExprError();
unsigned PtrArgIdx = 0;
@@ -20011,7 +18293,7 @@ ExprResult Sema::BuiltinMatrixColumnMajorLoad(CallExpr *TheCall,
ExprResult Sema::BuiltinMatrixColumnMajorStore(CallExpr *TheCall,
ExprResult CallResult) {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return ExprError();
unsigned PtrArgIdx = 1;
@@ -20137,7 +18419,7 @@ static bool CheckWasmBuiltinArgIsInteger(Sema &S, CallExpr *E,
/// Check that the first argument is a WebAssembly table, and the second
/// is an index to use as index into the table.
bool Sema::BuiltinWasmTableGet(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 2))
+ if (checkArgCount(TheCall, 2))
return true;
QualType ElTy;
@@ -20160,7 +18442,7 @@ bool Sema::BuiltinWasmTableGet(CallExpr *TheCall) {
/// an index to use as index into the table and the third is the reference
/// type to set into the table.
bool Sema::BuiltinWasmTableSet(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
QualType ElTy;
@@ -20178,7 +18460,7 @@ bool Sema::BuiltinWasmTableSet(CallExpr *TheCall) {
/// Check that the argument is a WebAssembly table.
bool Sema::BuiltinWasmTableSize(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 1))
+ if (checkArgCount(TheCall, 1))
return true;
QualType ElTy;
@@ -20192,7 +18474,7 @@ bool Sema::BuiltinWasmTableSize(CallExpr *TheCall) {
/// value to use for new elements (of a type matching the table type), the
/// third value is an integer.
bool Sema::BuiltinWasmTableGrow(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 3))
+ if (checkArgCount(TheCall, 3))
return true;
QualType ElTy;
@@ -20216,7 +18498,7 @@ bool Sema::BuiltinWasmTableGrow(CallExpr *TheCall) {
/// integer, the third is the value to use to fill the table (of a type
/// matching the table type), and the fourth is an integer.
bool Sema::BuiltinWasmTableFill(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 4))
+ if (checkArgCount(TheCall, 4))
return true;
QualType ElTy;
@@ -20243,7 +18525,7 @@ bool Sema::BuiltinWasmTableFill(CallExpr *TheCall) {
/// WebAssembly table (of the same element type), and the third to fifth
/// arguments are integers.
bool Sema::BuiltinWasmTableCopy(CallExpr *TheCall) {
- if (checkArgCount(*this, TheCall, 5))
+ if (checkArgCount(TheCall, 5))
return true;
QualType XElTy;
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 6764a979168d..2a87b26f17a2 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -50,6 +50,7 @@
#include "clang/Sema/SemaInternal.h"
#include "clang/Sema/SemaObjC.h"
#include "clang/Sema/SemaOpenMP.h"
+#include "clang/Sema/SemaRISCV.h"
#include "clang/Sema/Template.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/SmallString.h"
@@ -4985,7 +4986,7 @@ void Sema::setTagNameForLinkagePurposes(TagDecl *TagFromDeclSpec,
if (TagFromDeclSpec->hasNameForLinkage())
return;
- // A well-formed anonymous tag must always be a TUK_Definition.
+ // A well-formed anonymous tag must always be a TagUseKind::Definition.
assert(TagFromDeclSpec->isThisDeclarationADefinition());
// The type must match the tag exactly; no qualifiers allowed.
@@ -8926,8 +8927,8 @@ void Sema::CheckVariableDeclarationType(VarDecl *NewVD) {
const FunctionDecl *FD = cast<FunctionDecl>(CurContext);
llvm::StringMap<bool> CallerFeatureMap;
Context.getFunctionFeatureMap(CallerFeatureMap, FD);
- checkRVVTypeSupport(T, NewVD->getLocation(), cast<Decl>(CurContext),
- CallerFeatureMap);
+ RISCV().checkRVVTypeSupport(T, NewVD->getLocation(), cast<Decl>(CurContext),
+ CallerFeatureMap);
}
}
@@ -11867,8 +11868,8 @@ static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD,
return false;
if (!OldDecl || !OldDecl->getAsFunction() ||
- OldDecl->getDeclContext()->getRedeclContext() !=
- NewFD->getDeclContext()->getRedeclContext()) {
+ !OldDecl->getDeclContext()->getRedeclContext()->Equals(
+ NewFD->getDeclContext()->getRedeclContext())) {
// If there's no previous declaration, AND this isn't attempting to cause
// multiversioning, this isn't an error condition.
if (MVKind == MultiVersionKind::None)
@@ -17238,9 +17239,9 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
OffsetOfKind OOK, SkipBodyInfo *SkipBody) {
// If this is not a definition, it must have a name.
IdentifierInfo *OrigName = Name;
- assert((Name != nullptr || TUK == TUK_Definition) &&
+ assert((Name != nullptr || TUK == TagUseKind::Definition) &&
"Nameless record must be a definition!");
- assert(TemplateParameterLists.size() == 0 || TUK != TUK_Reference);
+ assert(TemplateParameterLists.size() == 0 || TUK != TagUseKind::Reference);
OwnedDecl = false;
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
@@ -17254,11 +17255,11 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// or a scope specifier, which also conveniently avoids this work
// for non-C++ cases.
if (TemplateParameterLists.size() > 0 ||
- (SS.isNotEmpty() && TUK != TUK_Reference)) {
+ (SS.isNotEmpty() && TUK != TagUseKind::Reference)) {
TemplateParameterList *TemplateParams =
MatchTemplateParametersToScopeSpecifier(
KWLoc, NameLoc, SS, nullptr, TemplateParameterLists,
- TUK == TUK_Friend, isMemberSpecialization, Invalid);
+ TUK == TagUseKind::Friend, isMemberSpecialization, Invalid);
// C++23 [dcl.type.elab] p2:
// If an elaborated-type-specifier is the sole constituent of a
@@ -17273,7 +17274,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// FIXME: Class template partial specializations can be forward declared
// per CWG2213, but the resolution failed to allow qualified forward
// declarations. This is almost certainly unintentional, so we allow them.
- if (TUK == TUK_Declaration && SS.isNotEmpty() && !isMemberSpecialization)
+ if (TUK == TagUseKind::Declaration && SS.isNotEmpty() &&
+ !isMemberSpecialization)
Diag(SS.getBeginLoc(), diag::err_standalone_class_nested_name_specifier)
<< TypeWithKeyword::getTagTypeKindName(Kind) << SS.getRange();
@@ -17310,7 +17312,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
return true;
}
- if (TUK == TUK_Friend && Kind == TagTypeKind::Enum) {
+ if (TUK == TagUseKind::Friend && Kind == TagTypeKind::Enum) {
// C++23 [dcl.type.elab]p4:
// If an elaborated-type-specifier appears with the friend specifier as
// an entire member-declaration, the member-declaration shall have one
@@ -17361,7 +17363,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// of 'int'. However, if this is an unfixed forward declaration, don't set
// the underlying type unless the user enables -fms-compatibility. This
// makes unfixed forward declared enums incomplete and is more conforming.
- if (TUK == TUK_Definition || getLangOpts().MSVCCompat)
+ if (TUK == TagUseKind::Definition || getLangOpts().MSVCCompat)
EnumUnderlying = Context.IntTy.getTypePtr();
}
}
@@ -17372,7 +17374,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
bool isStdAlignValT = false;
RedeclarationKind Redecl = forRedeclarationInCurContext();
- if (TUK == TUK_Friend || TUK == TUK_Reference)
+ if (TUK == TagUseKind::Friend || TUK == TagUseKind::Reference)
Redecl = RedeclarationKind::NotForRedeclaration;
/// Create a new tag decl in C/ObjC. Since the ODR-like semantics for ObjC/C
@@ -17390,7 +17392,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
New = EnumDecl::Create(Context, SearchDC, KWLoc, Loc, Name, nullptr,
ScopedEnum, ScopedEnumUsesClassTag, IsFixed);
// If this is an undefined enum, bail.
- if (TUK != TUK_Definition && !Invalid)
+ if (TUK != TagUseKind::Definition && !Invalid)
return nullptr;
if (EnumUnderlying) {
EnumDecl *ED = cast<EnumDecl>(New);
@@ -17418,7 +17420,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// many points during the parsing of a struct declaration (because
// the #pragma tokens are effectively skipped over during the
// parsing of the struct).
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
+ if (TUK == TagUseKind::Definition &&
+ (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(RD);
AddMsStructLayoutForRecord(RD);
}
@@ -17439,7 +17442,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// If this is a friend or a reference to a class in a dependent
// context, don't try to make a decl for it.
- if (TUK == TUK_Friend || TUK == TUK_Reference) {
+ if (TUK == TagUseKind::Friend || TUK == TagUseKind::Reference) {
DC = computeDeclContext(SS, false);
if (!DC) {
IsDependent = true;
@@ -17472,7 +17475,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// this as a dependent elaborated-type-specifier.
// But this only makes any sense for reference-like lookups.
if (Previous.wasNotFoundInCurrentInstantiation() &&
- (TUK == TUK_Reference || TUK == TUK_Friend)) {
+ (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend)) {
IsDependent = true;
return true;
}
@@ -17489,7 +17492,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// If T is the name of a class, then each of the following shall have a
// name different from T:
// -- every member of class T that is itself a type
- if (TUK != TUK_Reference && TUK != TUK_Friend &&
+ if (TUK != TagUseKind::Reference && TUK != TagUseKind::Friend &&
DiagnoseClassNameShadow(SearchDC, DeclarationNameInfo(Name, NameLoc)))
return true;
@@ -17503,7 +17506,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// When declaring or defining a tag, ignore ambiguities introduced
// by types using'ed into this scope.
if (Previous.isAmbiguous() &&
- (TUK == TUK_Definition || TUK == TUK_Declaration)) {
+ (TUK == TagUseKind::Definition || TUK == TagUseKind::Declaration)) {
LookupResult::Filter F = Previous.makeFilter();
while (F.hasNext()) {
NamedDecl *ND = F.next();
@@ -17527,7 +17530,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
//
// Does it matter that this should be by scope instead of by
// semantic context?
- if (!Previous.empty() && TUK == TUK_Friend) {
+ if (!Previous.empty() && TUK == TagUseKind::Friend) {
DeclContext *EnclosingNS = SearchDC->getEnclosingNamespaceContext();
LookupResult::Filter F = Previous.makeFilter();
bool FriendSawTagOutsideEnclosingNamespace = false;
@@ -17557,7 +17560,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
if (Previous.isAmbiguous())
return true;
- if (!getLangOpts().CPlusPlus && TUK != TUK_Reference) {
+ if (!getLangOpts().CPlusPlus && TUK != TagUseKind::Reference) {
// FIXME: This makes sure that we ignore the contexts associated
// with C structs, unions, and enums when looking for a matching
// tag declaration or definition. See the similar lookup tweak
@@ -17609,11 +17612,12 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// also need to do a redeclaration lookup there, just in case
// there's a shadow friend decl.
if (Name && Previous.empty() &&
- (TUK == TUK_Reference || TUK == TUK_Friend || IsTemplateParamOrArg)) {
+ (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend ||
+ IsTemplateParamOrArg)) {
if (Invalid) goto CreateNewDecl;
assert(SS.isEmpty());
- if (TUK == TUK_Reference || IsTemplateParamOrArg) {
+ if (TUK == TagUseKind::Reference || IsTemplateParamOrArg) {
// C++ [basic.scope.pdecl]p5:
// -- for an elaborated-type-specifier of the form
//
@@ -17647,7 +17651,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// Find the scope where we'll be declaring the tag.
S = getTagInjectionScope(S, getLangOpts());
} else {
- assert(TUK == TUK_Friend);
+ assert(TUK == TagUseKind::Friend);
CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(SearchDC);
// C++ [namespace.memdef]p3:
@@ -17712,7 +17716,8 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// redefinition if either context is within the other.
if (auto *Shadow = dyn_cast<UsingShadowDecl>(DirectPrevDecl)) {
auto *OldTag = dyn_cast<TagDecl>(PrevDecl);
- if (SS.isEmpty() && TUK != TUK_Reference && TUK != TUK_Friend &&
+ if (SS.isEmpty() && TUK != TagUseKind::Reference &&
+ TUK != TagUseKind::Friend &&
isDeclInScope(Shadow, SearchDC, S, isMemberSpecialization) &&
!(OldTag && isAcceptableTagRedeclContext(
*this, OldTag->getDeclContext(), SearchDC))) {
@@ -17731,13 +17736,13 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// If this is a use of a previous tag, or if the tag is already declared
// in the same scope (so that the definition/declaration completes or
// rementions the tag), reuse the decl.
- if (TUK == TUK_Reference || TUK == TUK_Friend ||
+ if (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend ||
isDeclInScope(DirectPrevDecl, SearchDC, S,
SS.isNotEmpty() || isMemberSpecialization)) {
// Make sure that this wasn't declared as an enum and now used as a
// struct or something similar.
if (!isAcceptableTagRedeclaration(PrevTagDecl, Kind,
- TUK == TUK_Definition, KWLoc,
+ TUK == TagUseKind::Definition, KWLoc,
Name)) {
bool SafeToContinue =
(PrevTagDecl->getTagKind() != TagTypeKind::Enum &&
@@ -17764,7 +17769,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
if (Kind == TagTypeKind::Enum &&
PrevTagDecl->getTagKind() == TagTypeKind::Enum) {
const EnumDecl *PrevEnum = cast<EnumDecl>(PrevTagDecl);
- if (TUK == TUK_Reference || TUK == TUK_Friend)
+ if (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend)
return PrevTagDecl;
QualType EnumUnderlyingTy;
@@ -17779,14 +17784,14 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
if (CheckEnumRedeclaration(NameLoc.isValid() ? NameLoc : KWLoc,
ScopedEnum, EnumUnderlyingTy,
IsFixed, PrevEnum))
- return TUK == TUK_Declaration ? PrevTagDecl : nullptr;
+ return TUK == TagUseKind::Declaration ? PrevTagDecl : nullptr;
}
// C++11 [class.mem]p1:
// A member shall not be declared twice in the member-specification,
// except that a nested class or member class template can be declared
// and then later defined.
- if (TUK == TUK_Declaration && PrevDecl->isCXXClassMember() &&
+ if (TUK == TagUseKind::Declaration && PrevDecl->isCXXClassMember() &&
S->isDeclScope(PrevDecl)) {
Diag(NameLoc, diag::ext_member_redeclared);
Diag(PrevTagDecl->getLocation(), diag::note_previous_declaration);
@@ -17795,11 +17800,11 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
if (!Invalid) {
// If this is a use, just return the declaration we found, unless
// we have attributes.
- if (TUK == TUK_Reference || TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend) {
if (!Attrs.empty()) {
// FIXME: Diagnose these attributes. For now, we create a new
// declaration to hold them.
- } else if (TUK == TUK_Reference &&
+ } else if (TUK == TagUseKind::Reference &&
(PrevTagDecl->getFriendObjectKind() ==
Decl::FOK_Undeclared ||
PrevDecl->getOwningModule() != getCurrentModule()) &&
@@ -17823,7 +17828,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
}
// Diagnose attempts to redefine a tag.
- if (TUK == TUK_Definition) {
+ if (TUK == TagUseKind::Definition) {
if (NamedDecl *Def = PrevTagDecl->getDefinition()) {
// If we're defining a specialization and the previous definition
// is from an implicit instantiation, don't emit an error
@@ -17903,7 +17908,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// Okay, we're going to make a redeclaration. If this is some kind
// of reference, make sure we build the redeclaration in the same DC
// as the original, and ignore the current access specifier.
- if (TUK == TUK_Friend || TUK == TUK_Reference) {
+ if (TUK == TagUseKind::Friend || TUK == TagUseKind::Reference) {
SearchDC = PrevTagDecl->getDeclContext();
AS = AS_none;
}
@@ -17929,7 +17934,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// Use a better diagnostic if an elaborated-type-specifier
// found the wrong kind of type on the first
// (non-redeclaration) lookup.
- if ((TUK == TUK_Reference || TUK == TUK_Friend) &&
+ if ((TUK == TagUseKind::Reference || TUK == TagUseKind::Friend) &&
!Previous.isForRedeclaration()) {
NonTagKind NTK = getNonTagTypeDeclKind(PrevDecl, Kind);
Diag(NameLoc, diag::err_tag_reference_non_tag)
@@ -17943,7 +17948,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc,
// do nothing
// Diagnose implicit declarations introduced by elaborated types.
- } else if (TUK == TUK_Reference || TUK == TUK_Friend) {
+ } else if (TUK == TagUseKind::Reference || TUK == TagUseKind::Friend) {
NonTagKind NTK = getNonTagTypeDeclKind(PrevDecl, Kind);
Diag(NameLoc, diag::err_tag_reference_conflict) << NTK;
Diag(PrevDecl->getLocation(), diag::note_previous_decl) << PrevDecl;
@@ -18002,7 +18007,7 @@ CreateNewDecl:
StdAlignValT = cast<EnumDecl>(New);
// If this is an undefined enum, warn.
- if (TUK != TUK_Definition && !Invalid) {
+ if (TUK != TagUseKind::Definition && !Invalid) {
TagDecl *Def;
if (IsFixed && cast<EnumDecl>(New)->isFixed()) {
// C++0x: 7.2p2: opaque-enum-declaration.
@@ -18052,21 +18057,22 @@ CreateNewDecl:
}
// Only C23 and later allow defining new types in 'offsetof()'.
- if (OOK != OOK_Outside && TUK == TUK_Definition && !getLangOpts().CPlusPlus &&
- !getLangOpts().C23)
+ if (OOK != OOK_Outside && TUK == TagUseKind::Definition &&
+ !getLangOpts().CPlusPlus && !getLangOpts().C23)
Diag(New->getLocation(), diag::ext_type_defined_in_offsetof)
<< (OOK == OOK_Macro) << New->getSourceRange();
// C++11 [dcl.type]p3:
// A type-specifier-seq shall not define a class or enumeration [...].
if (!Invalid && getLangOpts().CPlusPlus &&
- (IsTypeSpecifier || IsTemplateParamOrArg) && TUK == TUK_Definition) {
+ (IsTypeSpecifier || IsTemplateParamOrArg) &&
+ TUK == TagUseKind::Definition) {
Diag(New->getLocation(), diag::err_type_defined_in_type_specifier)
<< Context.getTagDeclType(New);
Invalid = true;
}
- if (!Invalid && getLangOpts().CPlusPlus && TUK == TUK_Definition &&
+ if (!Invalid && getLangOpts().CPlusPlus && TUK == TagUseKind::Definition &&
DC->getDeclKind() == Decl::Enum) {
Diag(New->getLocation(), diag::err_type_defined_in_enum)
<< Context.getTagDeclType(New);
@@ -18078,7 +18084,7 @@ CreateNewDecl:
if (SS.isSet()) {
// If this is either a declaration or a definition, check the
// nested-name-specifier against the current context.
- if ((TUK == TUK_Definition || TUK == TUK_Declaration) &&
+ if ((TUK == TagUseKind::Definition || TUK == TagUseKind::Declaration) &&
diagnoseQualifiedDeclaration(SS, DC, OrigName, Loc,
/*TemplateId=*/nullptr,
isMemberSpecialization))
@@ -18103,7 +18109,7 @@ CreateNewDecl:
// many points during the parsing of a struct declaration (because
// the #pragma tokens are effectively skipped over during the
// parsing of the struct).
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(RD);
AddMsStructLayoutForRecord(RD);
}
@@ -18134,7 +18140,7 @@ CreateNewDecl:
if (getLangOpts().CPlusPlus) {
// C++ [dcl.fct]p6:
// Types shall not be defined in return or parameter types.
- if (TUK == TUK_Definition && !IsTypeSpecifier) {
+ if (TUK == TagUseKind::Definition && !IsTypeSpecifier) {
Diag(Loc, diag::err_type_defined_in_param_type)
<< Name;
Invalid = true;
@@ -18155,7 +18161,7 @@ CreateNewDecl:
// In Microsoft mode, a friend declaration also acts as a forward
// declaration so we always pass true to setObjectOfFriendDecl to make
// the tag name visible.
- if (TUK == TUK_Friend)
+ if (TUK == TagUseKind::Friend)
New->setObjectOfFriendDecl(getLangOpts().MSVCCompat);
// Set the access specifier.
@@ -18165,14 +18171,14 @@ CreateNewDecl:
if (PrevDecl)
CheckRedeclarationInModule(New, PrevDecl);
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip))
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip))
New->startDefinition();
ProcessDeclAttributeList(S, New, Attrs);
AddPragmaAttributes(S, New);
// If this has an identifier, add it to the scope stack.
- if (TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
// We might be replacing an existing declaration in the lookup tables;
// if so, borrow its access specifier.
if (PrevDecl)
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index ca5938083917..5041fd65286f 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -8663,31 +8663,95 @@ static const RecordDecl *GetEnclosingNamedOrTopAnonRecord(const FieldDecl *FD) {
return RD;
}
-static bool
-CheckCountExpr(Sema &S, FieldDecl *FD, Expr *E,
- llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
+enum class CountedByInvalidPointeeTypeKind {
+ INCOMPLETE,
+ SIZELESS,
+ FUNCTION,
+ FLEXIBLE_ARRAY_MEMBER,
+ VALID,
+};
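+
+// For illustration only, the shapes this check is meant to accept are e.g.:
+//   struct S { int n; int fam[] __attribute__((counted_by(n))); };  // FAM
+//   struct T { int n; int *buf __attribute__((counted_by(n))); };   // pointer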
+
+static bool CheckCountedByAttrOnField(
+ Sema &S, FieldDecl *FD, Expr *E,
+ llvm::SmallVectorImpl<TypeCoupledDeclRefInfo> &Decls) {
+ // Check the context the attribute is used in
+
if (FD->getParent()->isUnion()) {
S.Diag(FD->getBeginLoc(), diag::err_counted_by_attr_in_union)
<< FD->getSourceRange();
return true;
}
- if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
- S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
- << E->getSourceRange();
+ const auto FieldTy = FD->getType();
+ if (!FieldTy->isArrayType() && !FieldTy->isPointerType()) {
+ S.Diag(FD->getBeginLoc(),
+ diag::err_counted_by_attr_not_on_ptr_or_flexible_array_member)
+ << FD->getLocation();
return true;
}
LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel =
LangOptions::StrictFlexArraysLevelKind::IncompleteOnly;
-
- if (!Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FD->getType(),
+ if (FieldTy->isArrayType() &&
+ !Decl::isFlexibleArrayMemberLike(S.getASTContext(), FD, FieldTy,
StrictFlexArraysLevel, true)) {
- // The "counted_by" attribute must be on a flexible array member.
- SourceRange SR = FD->getLocation();
- S.Diag(SR.getBegin(),
- diag::err_counted_by_attr_not_on_flexible_array_member)
- << SR;
+ S.Diag(FD->getBeginLoc(),
+ diag::err_counted_by_attr_on_array_not_flexible_array_member)
+ << FD->getLocation();
+ return true;
+ }
+
+ CountedByInvalidPointeeTypeKind InvalidTypeKind =
+ CountedByInvalidPointeeTypeKind::VALID;
+ QualType PointeeTy;
+ int SelectPtrOrArr = 0;
+ if (FieldTy->isPointerType()) {
+ PointeeTy = FieldTy->getPointeeType();
+ SelectPtrOrArr = 0;
+ } else {
+ assert(FieldTy->isArrayType());
+ const ArrayType *AT = S.getASTContext().getAsArrayType(FieldTy);
+ PointeeTy = AT->getElementType();
+ SelectPtrOrArr = 1;
+ }
+ // Note: The `Decl::isFlexibleArrayMemberLike` check earlier on means
+ // only `PointeeTy->isStructureTypeWithFlexibleArrayMember()` is reachable
+ // when `FieldTy->isArrayType()`.
+ bool ShouldWarn = false;
+ if (PointeeTy->isIncompleteType()) {
+ InvalidTypeKind = CountedByInvalidPointeeTypeKind::INCOMPLETE;
+ } else if (PointeeTy->isSizelessType()) {
+ InvalidTypeKind = CountedByInvalidPointeeTypeKind::SIZELESS;
+ } else if (PointeeTy->isFunctionType()) {
+ InvalidTypeKind = CountedByInvalidPointeeTypeKind::FUNCTION;
+ } else if (PointeeTy->isStructureTypeWithFlexibleArrayMember()) {
+ if (FieldTy->isArrayType()) {
+ // This is a workaround for the Linux kernel that has already adopted
+ // `counted_by` on a FAM where the pointee is a struct with a FAM. This
+ // should be an error because computing the bounds of the array cannot be
+ // done correctly without manually traversing every struct object in the
+      // array at runtime. To allow the code to be built, this error is
+ // downgraded to a warning.
+ ShouldWarn = true;
+ }
+ InvalidTypeKind = CountedByInvalidPointeeTypeKind::FLEXIBLE_ARRAY_MEMBER;
+ }
+
+ if (InvalidTypeKind != CountedByInvalidPointeeTypeKind::VALID) {
+ unsigned DiagID = ShouldWarn
+ ? diag::warn_counted_by_attr_elt_type_unknown_size
+ : diag::err_counted_by_attr_pointee_unknown_size;
+ S.Diag(FD->getBeginLoc(), DiagID)
+ << SelectPtrOrArr << PointeeTy << (int)InvalidTypeKind
+ << (ShouldWarn ? 1 : 0) << FD->getSourceRange();
+ return true;
+ }
+
+ // Check the expression
+
+ if (!E->getType()->isIntegerType() || E->getType()->isBooleanType()) {
+ S.Diag(E->getBeginLoc(), diag::err_counted_by_attr_argument_not_integer)
+ << E->getSourceRange();
return true;
}
@@ -8750,10 +8814,11 @@ static void handleCountedByAttrField(Sema &S, Decl *D, const ParsedAttr &AL) {
return;
llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
- if (CheckCountExpr(S, FD, CountExpr, Decls))
+ if (CheckCountedByAttrOnField(S, FD, CountExpr, Decls))
return;
- QualType CAT = S.BuildCountAttributedArrayType(FD->getType(), CountExpr);
+ QualType CAT =
+ S.BuildCountAttributedArrayOrPointerType(FD->getType(), CountExpr);
FD->setType(CAT);
}
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 104e27139fe4..8ab429e2a136 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -17580,11 +17580,12 @@ DeclResult Sema::ActOnTemplatedFriendTag(
if (Invalid)
return true;
- return CheckClassTemplate(S, TagSpec, TUK_Friend, TagLoc, SS, Name,
- NameLoc, Attr, TemplateParams, AS_public,
+ return CheckClassTemplate(S, TagSpec, TagUseKind::Friend, TagLoc, SS,
+ Name, NameLoc, Attr, TemplateParams, AS_public,
/*ModulePrivateLoc=*/SourceLocation(),
FriendLoc, TempParamLists.size() - 1,
- TempParamLists.data()).get();
+ TempParamLists.data())
+ .get();
} else {
// The "template<>" header is extraneous.
Diag(TemplateParams->getTemplateLoc(), diag::err_template_tag_noparams)
@@ -17612,8 +17613,8 @@ DeclResult Sema::ActOnTemplatedFriendTag(
if (SS.isEmpty()) {
bool Owned = false;
bool IsDependent = false;
- return ActOnTag(S, TagSpec, TUK_Friend, TagLoc, SS, Name, NameLoc, Attr,
- AS_public,
+ return ActOnTag(S, TagSpec, TagUseKind::Friend, TagLoc, SS, Name, NameLoc,
+ Attr, AS_public,
/*ModulePrivateLoc=*/SourceLocation(),
MultiTemplateParamsArg(), Owned, IsDependent,
/*ScopedEnumKWLoc=*/SourceLocation(),
@@ -17728,7 +17729,7 @@ Decl *Sema::ActOnFriendTypeDecl(Scope *S, const DeclSpec &DS,
// Try to convert the decl specifier to a type. This works for
// friend templates because ActOnTag never produces a ClassTemplateDecl
- // for a TUK_Friend.
+ // for a TagUseKind::Friend.
Declarator TheDeclarator(DS, ParsedAttributesView::none(),
DeclaratorContext::Member);
TypeSourceInfo *TSI = GetTypeForDeclarator(TheDeclarator);
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 7bb34fd7a479..ff9c5ead36dc 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -5185,7 +5185,7 @@ Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc,
}
// Perform default conversions.
- if (!LHSExp->getType()->getAs<VectorType>()) {
+ if (!LHSExp->getType()->isSubscriptableVectorType()) {
ExprResult Result = DefaultFunctionArrayLvalueConversion(LHSExp);
if (Result.isInvalid())
return ExprError();
@@ -5241,36 +5241,22 @@ Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc,
<< ResultType << BaseExpr->getSourceRange();
return ExprError();
}
- } else if (const VectorType *VTy = LHSTy->getAs<VectorType>()) {
- BaseExpr = LHSExp; // vectors: V[123]
- IndexExpr = RHSExp;
- // We apply C++ DR1213 to vector subscripting too.
- if (getLangOpts().CPlusPlus11 && LHSExp->isPRValue()) {
- ExprResult Materialized = TemporaryMaterializationConversion(LHSExp);
- if (Materialized.isInvalid())
- return ExprError();
- LHSExp = Materialized.get();
+ } else if (LHSTy->isSubscriptableVectorType()) {
+ if (LHSTy->isBuiltinType() &&
+ LHSTy->getAs<BuiltinType>()->isSveVLSBuiltinType()) {
+ const BuiltinType *BTy = LHSTy->getAs<BuiltinType>();
+ if (BTy->isSVEBool())
+ return ExprError(Diag(LLoc, diag::err_subscript_svbool_t)
+ << LHSExp->getSourceRange()
+ << RHSExp->getSourceRange());
+ ResultType = BTy->getSveEltType(Context);
+ } else {
+ const VectorType *VTy = LHSTy->getAs<VectorType>();
+ ResultType = VTy->getElementType();
}
- VK = LHSExp->getValueKind();
- if (VK != VK_PRValue)
- OK = OK_VectorComponent;
-
- ResultType = VTy->getElementType();
- QualType BaseType = BaseExpr->getType();
- Qualifiers BaseQuals = BaseType.getQualifiers();
- Qualifiers MemberQuals = ResultType.getQualifiers();
- Qualifiers Combined = BaseQuals + MemberQuals;
- if (Combined != MemberQuals)
- ResultType = Context.getQualifiedType(ResultType, Combined);
- } else if (LHSTy->isBuiltinType() &&
- LHSTy->getAs<BuiltinType>()->isSveVLSBuiltinType()) {
- const BuiltinType *BTy = LHSTy->getAs<BuiltinType>();
- if (BTy->isSVEBool())
- return ExprError(Diag(LLoc, diag::err_subscript_svbool_t)
- << LHSExp->getSourceRange() << RHSExp->getSourceRange());
-
- BaseExpr = LHSExp;
+ BaseExpr = LHSExp; // vectors: V[123]
IndexExpr = RHSExp;
+ // We apply C++ DR1213 to vector subscripting too.
if (getLangOpts().CPlusPlus11 && LHSExp->isPRValue()) {
ExprResult Materialized = TemporaryMaterializationConversion(LHSExp);
if (Materialized.isInvalid())
@@ -5281,8 +5267,6 @@ Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc,
if (VK != VK_PRValue)
OK = OK_VectorComponent;
- ResultType = BTy->getSveEltType(Context);
-
QualType BaseType = BaseExpr->getType();
Qualifiers BaseQuals = BaseType.getQualifiers();
Qualifiers MemberQuals = ResultType.getQualifiers();
@@ -5579,10 +5563,9 @@ ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc,
Res = Immediate.TransformInitializer(Param->getInit(),
/*NotCopy=*/false);
});
- if (Res.isInvalid())
- return ExprError();
- Res = ConvertParamDefaultArgument(Param, Res.get(),
- Res.get()->getBeginLoc());
+ if (Res.isUsable())
+ Res = ConvertParamDefaultArgument(Param, Res.get(),
+ Res.get()->getBeginLoc());
if (Res.isInvalid())
return ExprError();
Init = Res.get();
@@ -5616,9 +5599,10 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
InitializationContext.emplace(Loc, Field, CurContext);
Expr *Init = nullptr;
+ bool HasRewrittenInit = false;
bool NestedDefaultChecking = isCheckingDefaultArgumentOrInitializer();
-
+ bool InLifetimeExtendingContext = isInLifetimeExtendingContext();
EnterExpressionEvaluationContext EvalContext(
*this, ExpressionEvaluationContext::PotentiallyEvaluated, Field);
@@ -5653,19 +5637,36 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
ImmediateCallVisitor V(getASTContext());
if (!NestedDefaultChecking)
V.TraverseDecl(Field);
- if (V.HasImmediateCalls) {
+
+ // CWG1815
+ // Support lifetime extension of temporary created by aggregate
+ // initialization using a default member initializer. We should always rebuild
+ // the initializer if it contains any temporaries (if the initializer
+ // expression is an ExprWithCleanups). Then make sure the normal lifetime
+ // extension code recurses into the default initializer and does lifetime
+ // extension when warranted.
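+  // For illustration (hypothetical code): given
+  //   struct A { const int &r = 42; };
+  //   A a = {};
+  // the temporary bound to 'r' through the default member initializer is now
+  // lifetime-extended to the lifetime of 'a'.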
+ bool ContainsAnyTemporaries =
+ isa_and_present<ExprWithCleanups>(Field->getInClassInitializer());
+ if (V.HasImmediateCalls || InLifetimeExtendingContext ||
+ ContainsAnyTemporaries) {
+ HasRewrittenInit = true;
ExprEvalContexts.back().DelayedDefaultInitializationContext = {Loc, Field,
CurContext};
ExprEvalContexts.back().IsCurrentlyCheckingDefaultArgumentOrInitializer =
NestedDefaultChecking;
-
+ // Pass down lifetime extending flag, and collect temporaries in
+ // CreateMaterializeTemporaryExpr when we rewrite the call argument.
+ keepInLifetimeExtendingContext();
EnsureImmediateInvocationInDefaultArgs Immediate(*this);
ExprResult Res;
+
+    // Rebuilding the CXXDefaultInitExpr might cause diagnostics.
+ SFINAETrap Trap(*this);
runWithSufficientStackSpace(Loc, [&] {
Res = Immediate.TransformInitializer(Field->getInClassInitializer(),
/*CXXDirectInit=*/false);
});
- if (!Res.isInvalid())
+ if (Res.isUsable())
Res = ConvertMemberDefaultInitExpression(Field, Res.get(), Loc);
if (Res.isInvalid()) {
Field->setInvalidDecl();
@@ -5692,7 +5693,7 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) {
return CXXDefaultInitExpr::Create(Context, InitializationContext->Loc,
Field, InitializationContext->Context,
- Init);
+ HasRewrittenInit ? Init : nullptr);
}
// DR1351:
@@ -7543,27 +7544,6 @@ bool Sema::isValidSveBitcast(QualType srcTy, QualType destTy) {
ValidScalableConversion(destTy, srcTy);
}
-/// Are the two types RVV-bitcast-compatible types? I.e. is bitcasting from the
-/// first RVV type (e.g. an RVV scalable type) to the second type (e.g. an RVV
-/// VLS type) allowed?
-///
-/// This will also return false if the two given types do not make sense from
-/// the perspective of RVV bitcasts.
-bool Sema::isValidRVVBitcast(QualType srcTy, QualType destTy) {
- assert(srcTy->isVectorType() || destTy->isVectorType());
-
- auto ValidScalableConversion = [](QualType FirstType, QualType SecondType) {
- if (!FirstType->isRVVSizelessBuiltinType())
- return false;
-
- const auto *VecTy = SecondType->getAs<VectorType>();
- return VecTy && VecTy->getVectorKind() == VectorKind::RVVFixedLengthData;
- };
-
- return ValidScalableConversion(srcTy, destTy) ||
- ValidScalableConversion(destTy, srcTy);
-}
-
/// Are the two types matrix types and do they have the same dimensions i.e.
/// do they have the same number of rows and the same number of columns?
bool Sema::areMatrixTypesOfTheSameDimension(QualType srcTy, QualType destTy) {
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index f543e006060d..d3e9dcb4f439 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -1554,9 +1554,6 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo,
bool ListInitialization) {
QualType Ty = TInfo->getType();
SourceLocation TyBeginLoc = TInfo->getTypeLoc().getBeginLoc();
-
- assert((!ListInitialization || Exprs.size() == 1) &&
- "List initialization must have exactly one expression.");
SourceRange FullRange = SourceRange(TyBeginLoc, RParenOrBraceLoc);
InitializedEntity Entity =
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index 353e911c5cc3..79bdc8e9f878 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -8066,11 +8066,6 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path,
enum PathLifetimeKind {
/// Lifetime-extend along this path.
Extend,
- /// We should lifetime-extend, but we don't because (due to technical
- /// limitations) we can't. This happens for default member initializers,
- /// which we don't clone for every use, so we don't have a unique
- /// MaterializeTemporaryExpr to update.
- ShouldExtend,
/// Do not lifetime extend along this path.
NoExtend
};
@@ -8082,7 +8077,7 @@ shouldLifetimeExtendThroughPath(const IndirectLocalPath &Path) {
PathLifetimeKind Kind = PathLifetimeKind::Extend;
for (auto Elem : Path) {
if (Elem.Kind == IndirectLocalPathEntry::DefaultInit)
- Kind = PathLifetimeKind::ShouldExtend;
+ Kind = PathLifetimeKind::Extend;
else if (Elem.Kind != IndirectLocalPathEntry::LambdaCaptureInit)
return PathLifetimeKind::NoExtend;
}
@@ -8202,18 +8197,6 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity,
ExtendingEntity->allocateManglingNumber());
// Also visit the temporaries lifetime-extended by this initializer.
return true;
-
- case PathLifetimeKind::ShouldExtend:
- // We're supposed to lifetime-extend the temporary along this path (per
- // the resolution of DR1815), but we don't support that yet.
- //
- // FIXME: Properly handle this situation. Perhaps the easiest approach
- // would be to clone the initializer expression on each use that would
- // lifetime extend its temporaries.
- Diag(DiagLoc, diag::warn_unsupported_lifetime_extension)
- << RK << DiagRange;
- break;
-
case PathLifetimeKind::NoExtend:
// If the path goes through the initialization of a variable or field,
// it can't possibly reach a temporary created in this full-expression.
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index 1743afaf1528..276a43ad79b9 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -12,6 +12,7 @@
#include "clang/Sema/SemaLambda.h"
#include "TypeLocBuilder.h"
#include "clang/AST/ASTLambda.h"
+#include "clang/AST/CXXInheritance.h"
#include "clang/AST/ExprCXX.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Sema/DeclSpec.h"
@@ -386,30 +387,69 @@ buildTypeForLambdaCallOperator(Sema &S, clang::CXXRecordDecl *Class,
// parameter, if any, of the lambda's function call operator (possibly
// instantiated from a function call operator template) shall be either:
// - the closure type,
-// - class type derived from the closure type, or
+// - class type publicly and unambiguously derived from the closure type, or
// - a reference to a possibly cv-qualified such type.
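//
// For illustration (hypothetical code):
//   auto L = [x = 0](this auto &&self) { return x; };
// Here 'self' deduces to (a reference to) the closure type, which satisfies
// the rule above.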
-void Sema::DiagnoseInvalidExplicitObjectParameterInLambda(
- CXXMethodDecl *Method) {
+bool Sema::DiagnoseInvalidExplicitObjectParameterInLambda(
+ CXXMethodDecl *Method, SourceLocation CallLoc) {
if (!isLambdaCallWithExplicitObjectParameter(Method))
- return;
+ return false;
CXXRecordDecl *RD = Method->getParent();
if (Method->getType()->isDependentType())
- return;
+ return false;
if (RD->isCapturelessLambda())
- return;
- QualType ExplicitObjectParameterType = Method->getParamDecl(0)
- ->getType()
+ return false;
+
+ ParmVarDecl *Param = Method->getParamDecl(0);
+ QualType ExplicitObjectParameterType = Param->getType()
.getNonReferenceType()
.getUnqualifiedType()
.getDesugaredType(getASTContext());
QualType LambdaType = getASTContext().getRecordType(RD);
if (LambdaType == ExplicitObjectParameterType)
- return;
- if (IsDerivedFrom(RD->getLocation(), ExplicitObjectParameterType, LambdaType))
- return;
- Diag(Method->getParamDecl(0)->getLocation(),
- diag::err_invalid_explicit_object_type_in_lambda)
- << ExplicitObjectParameterType;
+ return false;
+
+ // Don't check the same instantiation twice.
+ //
+ // If this call operator is ill-formed, there is no point in issuing
+ // a diagnostic every time it is called because the problem is in the
+ // definition of the derived type, not at the call site.
+ //
+ // FIXME: Move this check to where we instantiate the method? This should
+ // be possible, but the naive approach of just marking the method as invalid
+ // leads to us emitting more diagnostics than we should have to for this case
+ // (1 error here *and* 1 error about there being no matching overload at the
+ // call site). It might be possible to avoid that by also checking if there
+ // is an empty cast path for the method stored in the context (signalling that
+ // we've already diagnosed it) and then just not building the call, but that
+ // doesn't really seem any simpler than diagnosing it at the call site...
+ if (auto It = Context.LambdaCastPaths.find(Method);
+ It != Context.LambdaCastPaths.end())
+ return It->second.empty();
+
+ CXXCastPath &Path = Context.LambdaCastPaths[Method];
+ CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
+ /*DetectVirtual=*/false);
+ if (!IsDerivedFrom(RD->getLocation(), ExplicitObjectParameterType, LambdaType,
+ Paths)) {
+ Diag(Param->getLocation(), diag::err_invalid_explicit_object_type_in_lambda)
+ << ExplicitObjectParameterType;
+ return true;
+ }
+
+ if (Paths.isAmbiguous(LambdaType->getCanonicalTypeUnqualified())) {
+ std::string PathsDisplay = getAmbiguousPathsDisplayString(Paths);
+ Diag(CallLoc, diag::err_explicit_object_lambda_ambiguous_base)
+ << LambdaType << PathsDisplay;
+ return true;
+ }
+
+ if (CheckBaseClassAccess(CallLoc, LambdaType, ExplicitObjectParameterType,
+ Paths.front(),
+ diag::err_explicit_object_lambda_inaccessible_base))
+ return true;
+
+ BuildBasePathArray(Paths, Path);
+ return false;
}
void Sema::handleLambdaNumbering(
diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp
index e4d4cd7395eb..ef0a655b631a 100644
--- a/clang/lib/Sema/SemaLookup.cpp
+++ b/clang/lib/Sema/SemaLookup.cpp
@@ -34,6 +34,7 @@
#include "clang/Sema/ScopeInfo.h"
#include "clang/Sema/Sema.h"
#include "clang/Sema/SemaInternal.h"
+#include "clang/Sema/SemaRISCV.h"
#include "clang/Sema/TemplateDeduction.h"
#include "clang/Sema/TypoCorrection.h"
#include "llvm/ADT/STLExtras.h"
@@ -945,13 +946,13 @@ bool Sema::LookupBuiltin(LookupResult &R) {
}
}
- if (DeclareRISCVVBuiltins || DeclareRISCVSiFiveVectorBuiltins) {
- if (!RVIntrinsicManager)
- RVIntrinsicManager = CreateRISCVIntrinsicManager(*this);
+ if (RISCV().DeclareRVVBuiltins || RISCV().DeclareSiFiveVectorBuiltins) {
+ if (!RISCV().IntrinsicManager)
+ RISCV().IntrinsicManager = CreateRISCVIntrinsicManager(*this);
- RVIntrinsicManager->InitIntrinsicList();
+ RISCV().IntrinsicManager->InitIntrinsicList();
- if (RVIntrinsicManager->CreateIntrinsicIfFound(R, II, PP))
+ if (RISCV().IntrinsicManager->CreateIntrinsicIfFound(R, II, PP))
return true;
}
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 6110e5229b07..bab61e8fd54e 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -9815,6 +9815,25 @@ static Stmt *buildPreInits(ASTContext &Context,
return nullptr;
}
+/// Append the \p Item or the content of a CompoundStmt to the list \p
+/// TargetList.
+///
+/// A CompoundStmt is used as a container in case multiple statements need to be
+/// stored in lieu of using an explicit list. Flattening is necessary because
+/// contained DeclStmts need to be visible after the execution of the list. Used
+/// for OpenMP pre-init declarations/statements.
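+///
+/// For illustration: appending a CompoundStmt such as `{ int I = 0; Foo(I); }`
+/// adds the contained DeclStmt and call individually to \p TargetList, while a
+/// non-compound statement is appended unchanged.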
+static void appendFlattendedStmtList(SmallVectorImpl<Stmt *> &TargetList,
+ Stmt *Item) {
+ // nullptr represents an empty list.
+ if (!Item)
+ return;
+
+ if (auto *CS = dyn_cast<CompoundStmt>(Item))
+ llvm::append_range(TargetList, CS->body());
+ else
+ TargetList.push_back(Item);
+}
+
/// Build preinits statement for the given declarations.
static Stmt *
buildPreInits(ASTContext &Context,
@@ -9828,6 +9847,17 @@ buildPreInits(ASTContext &Context,
return nullptr;
}
+/// Build pre-init statement for the given statements.
+static Stmt *buildPreInits(ASTContext &Context, ArrayRef<Stmt *> PreInits) {
+ if (PreInits.empty())
+ return nullptr;
+
+ SmallVector<Stmt *> Stmts;
+ for (Stmt *S : PreInits)
+ appendFlattendedStmtList(Stmts, S);
+  return CompoundStmt::Create(Context, Stmts, FPOptionsOverride(), {}, {});
+}
+
/// Build postupdate expression for the given list of postupdates expressions.
static Expr *buildPostUpdate(Sema &S, ArrayRef<Expr *> PostUpdates) {
Expr *PostUpdate = nullptr;
@@ -9924,11 +9954,21 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr,
Stmt *DependentPreInits = Transform->getPreInits();
if (!DependentPreInits)
return;
- for (Decl *C : cast<DeclStmt>(DependentPreInits)->getDeclGroup()) {
- auto *D = cast<VarDecl>(C);
- DeclRefExpr *Ref = buildDeclRefExpr(SemaRef, D, D->getType(),
- Transform->getBeginLoc());
- Captures[Ref] = Ref;
+
+ // Search for pre-init declared variables that need to be captured
+ // to be referenceable inside the directive.
+ SmallVector<Stmt *> Constituents;
+ appendFlattendedStmtList(Constituents, DependentPreInits);
+ for (Stmt *S : Constituents) {
+ if (auto *DC = dyn_cast<DeclStmt>(S)) {
+ for (Decl *C : DC->decls()) {
+ auto *D = cast<VarDecl>(C);
+ DeclRefExpr *Ref = buildDeclRefExpr(
+ SemaRef, D, D->getType().getNonReferenceType(),
+ Transform->getBeginLoc());
+ Captures[Ref] = Ref;
+ }
+ }
}
}))
return 0;
@@ -15059,9 +15099,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective(
bool SemaOpenMP::checkTransformableLoopNest(
OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
- Stmt *&Body,
- SmallVectorImpl<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>>
- &OriginalInits) {
+ Stmt *&Body, SmallVectorImpl<SmallVector<Stmt *, 0>> &OriginalInits) {
OriginalInits.emplace_back();
bool Result = OMPLoopBasedDirective::doForAllLoops(
AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops,
@@ -15095,16 +15133,70 @@ bool SemaOpenMP::checkTransformableLoopNest(
DependentPreInits = Dir->getPreInits();
else
llvm_unreachable("Unhandled loop transformation");
- if (!DependentPreInits)
- return;
- llvm::append_range(OriginalInits.back(),
- cast<DeclStmt>(DependentPreInits)->getDeclGroup());
+
+ appendFlattendedStmtList(OriginalInits.back(), DependentPreInits);
});
assert(OriginalInits.back().empty() && "No preinit after innermost loop");
OriginalInits.pop_back();
return Result;
}
+/// Add pre-init statements that need to be propagated from the selected loop.
+static void addLoopPreInits(ASTContext &Context,
+ OMPLoopBasedDirective::HelperExprs &LoopHelper,
+ Stmt *LoopStmt, ArrayRef<Stmt *> OriginalInit,
+ SmallVectorImpl<Stmt *> &PreInits) {
+
+ // For range-based for-statements, ensure that their syntactic sugar is
+ // executed by adding them as pre-init statements.
+ if (auto *CXXRangeFor = dyn_cast<CXXForRangeStmt>(LoopStmt)) {
+ Stmt *RangeInit = CXXRangeFor->getInit();
+ if (RangeInit)
+ PreInits.push_back(RangeInit);
+
+ DeclStmt *RangeStmt = CXXRangeFor->getRangeStmt();
+ PreInits.push_back(new (Context) DeclStmt(RangeStmt->getDeclGroup(),
+ RangeStmt->getBeginLoc(),
+ RangeStmt->getEndLoc()));
+
+ DeclStmt *RangeEnd = CXXRangeFor->getEndStmt();
+ PreInits.push_back(new (Context) DeclStmt(RangeEnd->getDeclGroup(),
+ RangeEnd->getBeginLoc(),
+ RangeEnd->getEndLoc()));
+ }
+
+ llvm::append_range(PreInits, OriginalInit);
+
+ // List of OMPCapturedExprDecl, for __begin, __end, and NumIterations
+ if (auto *PI = cast_or_null<DeclStmt>(LoopHelper.PreInits)) {
+ PreInits.push_back(new (Context) DeclStmt(
+ PI->getDeclGroup(), PI->getBeginLoc(), PI->getEndLoc()));
+ }
+
+ // Gather declarations for the data members used as counters.
+ for (Expr *CounterRef : LoopHelper.Counters) {
+ auto *CounterDecl = cast<DeclRefExpr>(CounterRef)->getDecl();
+ if (isa<OMPCapturedExprDecl>(CounterDecl))
+ PreInits.push_back(new (Context) DeclStmt(
+ DeclGroupRef(CounterDecl), SourceLocation(), SourceLocation()));
+ }
+}
+
+/// Collect the loop statements (ForStmt or CXXForRangeStmt) of the loops
+/// affected by a construct.
+static void collectLoopStmts(Stmt *AStmt, MutableArrayRef<Stmt *> LoopStmts) {
+ size_t NumLoops = LoopStmts.size();
+ OMPLoopBasedDirective::doForAllLoops(
+ AStmt, /*TryImperfectlyNestedLoops=*/false, NumLoops,
+ [LoopStmts](unsigned Cnt, Stmt *CurStmt) {
+ assert(!LoopStmts[Cnt] && "Loop statement must not yet be assigned");
+ LoopStmts[Cnt] = CurStmt;
+ return false;
+ });
+ assert(!is_contained(LoopStmts, nullptr) &&
+ "Expecting a loop statement for each affected loop");
+}
+
StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
Stmt *AStmt,
SourceLocation StartLoc,
@@ -15126,8 +15218,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
// Verify and diagnose loop nest.
SmallVector<OMPLoopBasedDirective::HelperExprs, 4> LoopHelpers(NumLoops);
Stmt *Body = nullptr;
- SmallVector<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>, 4>
- OriginalInits;
+ SmallVector<SmallVector<Stmt *, 0>, 4> OriginalInits;
if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body,
OriginalInits))
return StmtError();
@@ -15144,7 +15235,11 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
"Expecting loop iteration space dimensionality to match number of "
"affected loops");
- SmallVector<Decl *, 4> PreInits;
+ // Collect all affected loop statements.
+ SmallVector<Stmt *> LoopStmts(NumLoops, nullptr);
+ collectLoopStmts(AStmt, LoopStmts);
+
+ SmallVector<Stmt *, 4> PreInits;
CaptureVars CopyTransformer(SemaRef);
// Create iteration variables for the generated loops.
@@ -15184,20 +15279,9 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
&SemaRef.PP.getIdentifierTable().get(TileCntName));
TileIndVars[I] = TileCntDecl;
}
- for (auto &P : OriginalInits[I]) {
- if (auto *D = P.dyn_cast<Decl *>())
- PreInits.push_back(D);
- else if (auto *PI = dyn_cast_or_null<DeclStmt>(P.dyn_cast<Stmt *>()))
- PreInits.append(PI->decl_begin(), PI->decl_end());
- }
- if (auto *PI = cast_or_null<DeclStmt>(LoopHelper.PreInits))
- PreInits.append(PI->decl_begin(), PI->decl_end());
- // Gather declarations for the data members used as counters.
- for (Expr *CounterRef : LoopHelper.Counters) {
- auto *CounterDecl = cast<DeclRefExpr>(CounterRef)->getDecl();
- if (isa<OMPCapturedExprDecl>(CounterDecl))
- PreInits.push_back(CounterDecl);
- }
+
+ addLoopPreInits(Context, LoopHelper, LoopStmts[I], OriginalInits[I],
+ PreInits);
}
// Once the original iteration values are set, append the innermost body.
@@ -15246,19 +15330,20 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers[I];
Expr *NumIterations = LoopHelper.NumIterations;
auto *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]);
- QualType CntTy = OrigCntVar->getType();
+ QualType IVTy = NumIterations->getType();
+ Stmt *LoopStmt = LoopStmts[I];
// Commonly used variables. One of the constraints of an AST is that every
    // node object must appear at most once, hence we define lambdas that create
// a new AST node at every use.
- auto MakeTileIVRef = [&SemaRef = this->SemaRef, &TileIndVars, I, CntTy,
+ auto MakeTileIVRef = [&SemaRef = this->SemaRef, &TileIndVars, I, IVTy,
OrigCntVar]() {
- return buildDeclRefExpr(SemaRef, TileIndVars[I], CntTy,
+ return buildDeclRefExpr(SemaRef, TileIndVars[I], IVTy,
OrigCntVar->getExprLoc());
};
- auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, CntTy,
+ auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, IVTy,
OrigCntVar]() {
- return buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
+ return buildDeclRefExpr(SemaRef, FloorIndVars[I], IVTy,
OrigCntVar->getExprLoc());
};
@@ -15320,6 +15405,8 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
// further into the inner loop.
SmallVector<Stmt *, 4> BodyParts;
BodyParts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end());
+ if (auto *SourceCXXFor = dyn_cast<CXXForRangeStmt>(LoopStmt))
+ BodyParts.push_back(SourceCXXFor->getLoopVarStmt());
BodyParts.push_back(Inner);
Inner = CompoundStmt::Create(Context, BodyParts, FPOptionsOverride(),
Inner->getBeginLoc(), Inner->getEndLoc());
@@ -15334,12 +15421,14 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
auto &LoopHelper = LoopHelpers[I];
Expr *NumIterations = LoopHelper.NumIterations;
DeclRefExpr *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]);
- QualType CntTy = OrigCntVar->getType();
+ QualType IVTy = NumIterations->getType();
- // Commonly used variables.
- auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, CntTy,
+ // Commonly used variables. One of the constraints of an AST is that every
+    // node object must appear at most once, hence we define lambdas that create
+ // a new AST node at every use.
+ auto MakeFloorIVRef = [&SemaRef = this->SemaRef, &FloorIndVars, I, IVTy,
OrigCntVar]() {
- return buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy,
+ return buildDeclRefExpr(SemaRef, FloorIndVars[I], IVTy,
OrigCntVar->getExprLoc());
};
@@ -15405,8 +15494,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
Stmt *Body = nullptr;
SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
NumLoops);
- SmallVector<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>, NumLoops + 1>
- OriginalInits;
+ SmallVector<SmallVector<Stmt *, 0>, NumLoops + 1> OriginalInits;
if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers,
Body, OriginalInits))
return StmtError();
@@ -15418,6 +15506,10 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
NumGeneratedLoops, nullptr, nullptr);
+ assert(LoopHelpers.size() == NumLoops &&
+ "Expecting a single-dimensional loop iteration space");
+ assert(OriginalInits.size() == NumLoops &&
+ "Expecting a single-dimensional loop iteration space");
OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers.front();
if (FullClause) {
@@ -15481,24 +15573,13 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
// of a canonical loop nest where these PreInits are emitted before the
// outermost directive.
+ // Find the loop statement.
+ Stmt *LoopStmt = nullptr;
+ collectLoopStmts(AStmt, {LoopStmt});
+
// Determine the PreInit declarations.
- SmallVector<Decl *, 4> PreInits;
- assert(OriginalInits.size() == 1 &&
- "Expecting a single-dimensional loop iteration space");
- for (auto &P : OriginalInits[0]) {
- if (auto *D = P.dyn_cast<Decl *>())
- PreInits.push_back(D);
- else if (auto *PI = dyn_cast_or_null<DeclStmt>(P.dyn_cast<Stmt *>()))
- PreInits.append(PI->decl_begin(), PI->decl_end());
- }
- if (auto *PI = cast_or_null<DeclStmt>(LoopHelper.PreInits))
- PreInits.append(PI->decl_begin(), PI->decl_end());
- // Gather declarations for the data members used as counters.
- for (Expr *CounterRef : LoopHelper.Counters) {
- auto *CounterDecl = cast<DeclRefExpr>(CounterRef)->getDecl();
- if (isa<OMPCapturedExprDecl>(CounterDecl))
- PreInits.push_back(CounterDecl);
- }
+ SmallVector<Stmt *, 4> PreInits;
+ addLoopPreInits(Context, LoopHelper, LoopStmt, OriginalInits[0], PreInits);
auto *IterationVarRef = cast<DeclRefExpr>(LoopHelper.IterationVarRef);
QualType IVTy = IterationVarRef->getType();
@@ -15604,6 +15685,8 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
// Inner For statement.
SmallVector<Stmt *> InnerBodyStmts;
InnerBodyStmts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end());
+ if (auto *CXXRangeFor = dyn_cast<CXXForRangeStmt>(LoopStmt))
+ InnerBodyStmts.push_back(CXXRangeFor->getLoopVarStmt());
InnerBodyStmts.push_back(Body);
CompoundStmt *InnerBody =
CompoundStmt::Create(getASTContext(), InnerBodyStmts, FPOptionsOverride(),
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 2eb25237a0de..0c89fca8d38e 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -6472,17 +6472,20 @@ ExprResult Sema::InitializeExplicitObjectArgument(Sema &S, Expr *Obj,
Obj->getExprLoc(), Obj);
}
-static void PrepareExplicitObjectArgument(Sema &S, CXXMethodDecl *Method,
+static bool PrepareExplicitObjectArgument(Sema &S, CXXMethodDecl *Method,
Expr *Object, MultiExprArg &Args,
SmallVectorImpl<Expr *> &NewArgs) {
assert(Method->isExplicitObjectMemberFunction() &&
"Method is not an explicit member function");
assert(NewArgs.empty() && "NewArgs should be empty");
+
NewArgs.reserve(Args.size() + 1);
Expr *This = GetExplicitObjectExpr(S, Object, Method);
NewArgs.push_back(This);
NewArgs.append(Args.begin(), Args.end());
Args = NewArgs;
+ return S.DiagnoseInvalidExplicitObjectParameterInLambda(
+ Method, Object->getBeginLoc());
}
/// Determine whether the provided type is an integral type, or an enumeration
@@ -15612,8 +15615,10 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
CallExpr *TheCall = nullptr;
llvm::SmallVector<Expr *, 8> NewArgs;
if (Method->isExplicitObjectMemberFunction()) {
- PrepareExplicitObjectArgument(*this, Method, MemExpr->getBase(), Args,
- NewArgs);
+ if (PrepareExplicitObjectArgument(*this, Method, MemExpr->getBase(), Args,
+ NewArgs))
+ return ExprError();
+
// Build the actual expression node.
ExprResult FnExpr =
CreateFunctionRefExpr(*this, Method, FoundDecl, MemExpr,
@@ -15927,9 +15932,7 @@ Sema::BuildCallToObjectOfClassType(Scope *S, Expr *Obj,
// Initialize the object parameter.
llvm::SmallVector<Expr *, 8> NewArgs;
if (Method->isExplicitObjectMemberFunction()) {
- // FIXME: we should do that during the definition of the lambda when we can.
- DiagnoseInvalidExplicitObjectParameterInLambda(Method);
- PrepareExplicitObjectArgument(*this, Method, Obj, Args, NewArgs);
+ IsError |= PrepareExplicitObjectArgument(*this, Method, Obj, Args, NewArgs);
} else {
ExprResult ObjRes = PerformImplicitObjectArgumentInitialization(
Object.get(), /*Qualifier=*/nullptr, Best->FoundDecl, Method);
diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp
index 26e13e87b1d6..ea6e3f75490b 100644
--- a/clang/lib/Sema/SemaRISCV.cpp
+++ b/clang/lib/Sema/SemaRISCV.cpp
@@ -1,4 +1,4 @@
-//==- SemaRISCVVectorLookup.cpp - Name Lookup for RISC-V Vector Intrinsic -==//
+//===------ SemaRISCV.cpp ------- RISC-V target-specific routines ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,20 +6,24 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements name lookup for RISC-V vector intrinsic.
+// This file implements semantic analysis functions specific to RISC-V.
//
//===----------------------------------------------------------------------===//
+#include "clang/Sema/SemaRISCV.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Builtins.h"
+#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Lex/Preprocessor.h"
+#include "clang/Sema/Initialization.h"
#include "clang/Sema/Lookup.h"
#include "clang/Sema/RISCVIntrinsicManager.h"
#include "clang/Sema/Sema.h"
#include "clang/Support/RISCVVIntrinsicUtils.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/TargetParser/RISCVTargetParser.h"
#include <optional>
#include <string>
#include <vector>
@@ -166,7 +170,6 @@ private:
// Mapping function name to RVVOverloadIntrinsicDef.
StringMap<RVVOverloadIntrinsicDef> OverloadIntrinsics;
-
// Create RVVIntrinsicDef.
void InitRVVIntrinsic(const RVVIntrinsicRecord &Record, StringRef SuffixStr,
StringRef OverloadedSuffixStr, bool IsMask,
@@ -342,18 +345,17 @@ void RISCVIntrinsicManagerImpl::ConstructRVVIntrinsics(
/*IsMask=*/true, *PolicyTypes, MaskedHasPolicy, P);
}
} // End for different LMUL
- } // End for different TypeRange
+ } // End for different TypeRange
}
}
void RISCVIntrinsicManagerImpl::InitIntrinsicList() {
- if (S.DeclareRISCVVBuiltins && !ConstructedRISCVVBuiltins) {
+ if (S.RISCV().DeclareRVVBuiltins && !ConstructedRISCVVBuiltins) {
ConstructedRISCVVBuiltins = true;
- ConstructRVVIntrinsics(RVVIntrinsicRecords,
- IntrinsicKind::RVV);
+ ConstructRVVIntrinsics(RVVIntrinsicRecords, IntrinsicKind::RVV);
}
- if (S.DeclareRISCVSiFiveVectorBuiltins &&
+ if (S.RISCV().DeclareSiFiveVectorBuiltins &&
!ConstructedRISCVSiFiveVectorBuiltins) {
ConstructedRISCVSiFiveVectorBuiltins = true;
ConstructRVVIntrinsics(RVSiFiveVectorIntrinsicRecords,
@@ -501,4 +503,925 @@ std::unique_ptr<clang::sema::RISCVIntrinsicManager>
CreateRISCVIntrinsicManager(Sema &S) {
return std::make_unique<RISCVIntrinsicManagerImpl>(S);
}
+
+bool SemaRISCV::CheckLMUL(CallExpr *TheCall, unsigned ArgNum) {
+ llvm::APSInt Result;
+
+ // We can't check the value of a dependent argument.
+ Expr *Arg = TheCall->getArg(ArgNum);
+ if (Arg->isTypeDependent() || Arg->isValueDependent())
+ return false;
+
+ // Check constant-ness first.
+ if (SemaRef.BuiltinConstantArg(TheCall, ArgNum, Result))
+ return true;
+
+ int64_t Val = Result.getSExtValue();
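+  // Valid vtype LMUL encodings are 0-3 (m1, m2, m4, m8) and 5-7 (mf8, mf4,
+  // mf2); encoding 4 is reserved.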
+ if ((Val >= 0 && Val <= 3) || (Val >= 5 && Val <= 7))
+ return false;
+
+ return Diag(TheCall->getBeginLoc(), diag::err_riscv_builtin_invalid_lmul)
+ << Arg->getSourceRange();
+}
+
+static bool CheckInvalidVLENandLMUL(const TargetInfo &TI, CallExpr *TheCall,
+ Sema &S, QualType Type, int EGW) {
+ assert((EGW == 128 || EGW == 256) && "EGW can only be 128 or 256 bits");
+
+ // LMUL * VLEN >= EGW
+ ASTContext::BuiltinVectorTypeInfo Info =
+ S.Context.getBuiltinVectorTypeInfo(Type->castAs<BuiltinType>());
+ unsigned ElemSize = S.Context.getTypeSize(Info.ElementType);
+ unsigned MinElemCount = Info.EC.getKnownMinValue();
+
+ unsigned EGS = EGW / ElemSize;
+ // If EGS is less than or equal to the minimum number of elements, then the
+ // type is valid.
+ if (EGS <= MinElemCount)
+ return false;
+
+  // Otherwise, we need vscale to be at least EGS / MinElemCount.
+ assert(EGS % MinElemCount == 0);
+ unsigned VScaleFactor = EGS / MinElemCount;
+ // Vscale is VLEN/RVVBitsPerBlock.
+ unsigned MinRequiredVLEN = VScaleFactor * llvm::RISCV::RVVBitsPerBlock;
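+  // For example (illustrative): with EGW = 128 and a vint32m1_t operand
+  // (ElemSize = 32, MinElemCount = 2), EGS = 4 and VScaleFactor = 2, so the
+  // call requires VLEN >= 128, i.e. the zvl128b extension.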
+ std::string RequiredExt = "zvl" + std::to_string(MinRequiredVLEN) + "b";
+ if (!TI.hasFeature(RequiredExt))
+ return S.Diag(TheCall->getBeginLoc(),
+ diag::err_riscv_type_requires_extension)
+ << Type << RequiredExt;
+
+ return false;
+}
+
+bool SemaRISCV::CheckBuiltinFunctionCall(const TargetInfo &TI,
+ unsigned BuiltinID,
+ CallExpr *TheCall) {
+ ASTContext &Context = getASTContext();
+ // vmulh.vv, vmulh.vx, vmulhu.vv, vmulhu.vx, vmulhsu.vv, vmulhsu.vx,
+ // vsmul.vv, vsmul.vx are not included for EEW=64 in Zve64*.
+ switch (BuiltinID) {
+ default:
+ break;
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_m:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_m:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulhsu_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_m:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_m:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulhu_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_m:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_m:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vmulh_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vmulh_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_m:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_m:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tumu: {
+ ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(
+ TheCall->getType()->castAs<BuiltinType>());
+
+ if (Context.getTypeSize(Info.ElementType) == 64 && !TI.hasFeature("v"))
+ return Diag(TheCall->getBeginLoc(),
+ diag::err_riscv_builtin_requires_extension)
+ << /* IsExtension */ true << TheCall->getSourceRange() << "v";
+
+ break;
+ }
+ }
+
+ switch (BuiltinID) {
+ case RISCVVector::BI__builtin_rvv_vsetvli:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 3) ||
+ CheckLMUL(TheCall, 2);
+ case RISCVVector::BI__builtin_rvv_vsetvlimax:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ CheckLMUL(TheCall, 1);
+ case RISCVVector::BI__builtin_rvv_vget_v: {
+ ASTContext::BuiltinVectorTypeInfo ResVecInfo =
+ Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
+ TheCall->getType().getCanonicalType().getTypePtr()));
+ ASTContext::BuiltinVectorTypeInfo VecInfo =
+ Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
+ TheCall->getArg(0)->getType().getCanonicalType().getTypePtr()));
+ unsigned MaxIndex;
+ if (VecInfo.NumVectors != 1) // vget for tuple type
+ MaxIndex = VecInfo.NumVectors;
+ else // vget for non-tuple type
+ MaxIndex = (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors) /
+ (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors);
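+    // For example (illustrative): extracting a vint32m1_t (<vscale x 2 x i32>)
+    // from a vint32m4_t (<vscale x 8 x i32>) gives MaxIndex = 8 / 2 = 4, so
+    // the index operand must be in [0, 3].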
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1);
+ }
+ case RISCVVector::BI__builtin_rvv_vset_v: {
+ ASTContext::BuiltinVectorTypeInfo ResVecInfo =
+ Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
+ TheCall->getType().getCanonicalType().getTypePtr()));
+ ASTContext::BuiltinVectorTypeInfo VecInfo =
+ Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(
+ TheCall->getArg(2)->getType().getCanonicalType().getTypePtr()));
+ unsigned MaxIndex;
+ if (ResVecInfo.NumVectors != 1) // vset for tuple type
+ MaxIndex = ResVecInfo.NumVectors;
+    else // vset for non-tuple type
+ MaxIndex = (ResVecInfo.EC.getKnownMinValue() * ResVecInfo.NumVectors) /
+ (VecInfo.EC.getKnownMinValue() * VecInfo.NumVectors);
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, MaxIndex - 1);
+ }
+ // Vector Crypto
+ case RISCVVector::BI__builtin_rvv_vaeskf1_vi_tu:
+ case RISCVVector::BI__builtin_rvv_vaeskf2_vi_tu:
+ case RISCVVector::BI__builtin_rvv_vaeskf2_vi:
+ case RISCVVector::BI__builtin_rvv_vsm4k_vi_tu: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ QualType Op2Type = TheCall->getArg(1)->getType();
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, 128) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op2Type, 128) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 31);
+ }
+ case RISCVVector::BI__builtin_rvv_vsm3c_vi_tu:
+ case RISCVVector::BI__builtin_rvv_vsm3c_vi: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, 256) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 31);
+ }
+ case RISCVVector::BI__builtin_rvv_vaeskf1_vi:
+ case RISCVVector::BI__builtin_rvv_vsm4k_vi: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, 128) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31);
+ }
+ case RISCVVector::BI__builtin_rvv_vaesdf_vv:
+ case RISCVVector::BI__builtin_rvv_vaesdf_vs:
+ case RISCVVector::BI__builtin_rvv_vaesdm_vv:
+ case RISCVVector::BI__builtin_rvv_vaesdm_vs:
+ case RISCVVector::BI__builtin_rvv_vaesef_vv:
+ case RISCVVector::BI__builtin_rvv_vaesef_vs:
+ case RISCVVector::BI__builtin_rvv_vaesem_vv:
+ case RISCVVector::BI__builtin_rvv_vaesem_vs:
+ case RISCVVector::BI__builtin_rvv_vaesz_vs:
+ case RISCVVector::BI__builtin_rvv_vsm4r_vv:
+ case RISCVVector::BI__builtin_rvv_vsm4r_vs:
+ case RISCVVector::BI__builtin_rvv_vaesdf_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaesdf_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vaesdm_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaesdm_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vaesef_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaesef_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vaesem_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaesem_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vaesz_vs_tu:
+ case RISCVVector::BI__builtin_rvv_vsm4r_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsm4r_vs_tu: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ QualType Op2Type = TheCall->getArg(1)->getType();
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type, 128) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op2Type, 128);
+ }
+ case RISCVVector::BI__builtin_rvv_vsha2ch_vv:
+ case RISCVVector::BI__builtin_rvv_vsha2cl_vv:
+ case RISCVVector::BI__builtin_rvv_vsha2ms_vv:
+ case RISCVVector::BI__builtin_rvv_vsha2ch_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsha2cl_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsha2ms_vv_tu: {
+ QualType Op1Type = TheCall->getArg(0)->getType();
+ QualType Op2Type = TheCall->getArg(1)->getType();
+ QualType Op3Type = TheCall->getArg(2)->getType();
+ ASTContext::BuiltinVectorTypeInfo Info =
+ Context.getBuiltinVectorTypeInfo(Op1Type->castAs<BuiltinType>());
+ uint64_t ElemSize = Context.getTypeSize(Info.ElementType);
+ if (ElemSize == 64 && !TI.hasFeature("zvknhb"))
+ return Diag(TheCall->getBeginLoc(),
+ diag::err_riscv_builtin_requires_extension)
+             << /* IsExtension */ true << TheCall->getSourceRange() << "zvknhb";
+
+ return CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op1Type,
+ ElemSize * 4) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op2Type,
+ ElemSize * 4) ||
+ CheckInvalidVLENandLMUL(TI, TheCall, SemaRef, Op3Type, ElemSize * 4);
+ }
+
+ case RISCVVector::BI__builtin_rvv_sf_vc_i_se:
+ // bit_27_26, bit_24_20, bit_11_7, simm5, sew, log2lmul
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 3, -16, 15) ||
+ CheckLMUL(TheCall, 5);
+ case RISCVVector::BI__builtin_rvv_sf_vc_iv_se:
+ // bit_27_26, bit_11_7, vs2, simm5
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 3, -16, 15);
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_i:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_i_se:
+ // bit_27_26, bit_24_20, simm5
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, -16, 15);
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_iv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_iv_se:
+ // bit_27_26, vs2, simm5
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, -16, 15);
+ case RISCVVector::BI__builtin_rvv_sf_vc_ivv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_ivw_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_ivv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_ivw:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_ivv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_ivw_se:
+ // bit_27_26, vd, vs2, simm5
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 3, -16, 15);
+ case RISCVVector::BI__builtin_rvv_sf_vc_x_se:
+ // bit_27_26, bit_24_20, bit_11_7, xs1, sew, log2lmul
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 31) ||
+ CheckLMUL(TheCall, 5);
+ case RISCVVector::BI__builtin_rvv_sf_vc_xv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_vv_se:
+ // bit_27_26, bit_11_7, vs2, xs1/vs1
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_x:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_x_se:
+    // bit_27_26, bit_24_20, xs1
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31);
+ case RISCVVector::BI__builtin_rvv_sf_vc_vvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_xvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_vvw_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_xvw_se:
+ // bit_27_26, vd, vs2, xs1
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vv_se:
+ // bit_27_26, vs2, xs1/vs1
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xvv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vvv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xvw:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vvw:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_xvw_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_vvw_se:
+ // bit_27_26, vd, vs2, xs1/vs1
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 3);
+ case RISCVVector::BI__builtin_rvv_sf_vc_fv_se:
+ // bit_26, bit_11_7, vs2, fs1
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 1) ||
+ SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 31);
+ case RISCVVector::BI__builtin_rvv_sf_vc_fvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_fvw_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fvv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fvw:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fvv_se:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fvw_se:
+ // bit_26, vd, vs2, fs1
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fv:
+ case RISCVVector::BI__builtin_rvv_sf_vc_v_fv_se:
+ // bit_26, vs2, fs1
+ return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 1);
+ // Check if byteselect is in [0, 3]
+ case RISCV::BI__builtin_riscv_aes32dsi:
+ case RISCV::BI__builtin_riscv_aes32dsmi:
+ case RISCV::BI__builtin_riscv_aes32esi:
+ case RISCV::BI__builtin_riscv_aes32esmi:
+ case RISCV::BI__builtin_riscv_sm4ks:
+ case RISCV::BI__builtin_riscv_sm4ed:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 3);
+ // Check if rnum is in [0, 10]
+ case RISCV::BI__builtin_riscv_aes64ks1i:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 10);
+  // Check if the vxrm value is in [0, 3]
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx:
+ case RISCVVector::BI__builtin_rvv_vasub_vv:
+ case RISCVVector::BI__builtin_rvv_vasub_vx:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx:
+ case RISCVVector::BI__builtin_rvv_vssra_vv:
+ case RISCVVector::BI__builtin_rvv_vssra_vx:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 3);
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_tu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_tu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_tu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_tu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_tu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_tu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_m:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_m:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_m:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_m:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_m:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_m:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_m:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_m:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_m:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_m:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_m:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_m:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_m:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_m:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_m:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_m:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_m:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_m:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 3, 0, 3);
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vaaddu_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vaadd_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vasubu_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vasub_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vasub_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_mu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_mu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_mu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_mu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_mu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_mu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_tum:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_tum:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_tum:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_tum:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_tum:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_tum:
+ case RISCVVector::BI__builtin_rvv_vsmul_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vsmul_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vssra_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vssra_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vv_tumu:
+ case RISCVVector::BI__builtin_rvv_vssrl_vx_tumu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wv_tumu:
+ case RISCVVector::BI__builtin_rvv_vnclip_wx_tumu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wv_tumu:
+ case RISCVVector::BI__builtin_rvv_vnclipu_wx_tumu:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 4, 0, 3);
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 4);
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm:
+ case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm:
+ case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm:
+ case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_m:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 4);
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfsqrt_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfrec7_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_x_f_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_xu_f_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_x_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfcvt_f_xu_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_x_f_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwcvt_xu_f_v_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_x_f_w_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_xu_f_w_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_x_w_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_xu_w_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfncvt_f_f_w_rm_mu:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 3, 0, 4);
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_m:
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfredosum_vs_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfredusum_vs_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwredosum_vs_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfwredusum_vs_rm_tum:
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_tumu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfadd_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfrsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwadd_wf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwsub_wf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmul_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfdiv_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfrdiv_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmul_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmacc_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmacc_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmsac_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmsac_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmadd_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmadd_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfmsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfnmsub_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmacc_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwnmacc_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwmsac_vf_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vv_rm_mu:
+ case RISCVVector::BI__builtin_rvv_vfwnmsac_vf_rm_mu:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 4, 0, 4);
+ case RISCV::BI__builtin_riscv_ntl_load:
+ case RISCV::BI__builtin_riscv_ntl_store:
+ DeclRefExpr *DRE =
+ cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
+ assert((BuiltinID == RISCV::BI__builtin_riscv_ntl_store ||
+ BuiltinID == RISCV::BI__builtin_riscv_ntl_load) &&
+ "Unexpected RISC-V nontemporal load/store builtin!");
+ bool IsStore = BuiltinID == RISCV::BI__builtin_riscv_ntl_store;
+ unsigned NumArgs = IsStore ? 3 : 2;
+
+ if (SemaRef.checkArgCountAtLeast(TheCall, NumArgs - 1))
+ return true;
+
+ if (SemaRef.checkArgCountAtMost(TheCall, NumArgs))
+ return true;
+
+ // Domain value should be compile-time constant.
+ // 2 <= domain <= 5
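+  // For example (illustrative): __builtin_riscv_ntl_load(ptr) or
+  // __builtin_riscv_ntl_load(ptr, domain), and __builtin_riscv_ntl_store(ptr,
+  // val) or __builtin_riscv_ntl_store(ptr, val, domain), with domain in [2, 5].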
+ if (TheCall->getNumArgs() == NumArgs &&
+ SemaRef.BuiltinConstantArgRange(TheCall, NumArgs - 1, 2, 5))
+ return true;
+
+ Expr *PointerArg = TheCall->getArg(0);
+ ExprResult PointerArgResult =
+ SemaRef.DefaultFunctionArrayLvalueConversion(PointerArg);
+
+ if (PointerArgResult.isInvalid())
+ return true;
+ PointerArg = PointerArgResult.get();
+
+ const PointerType *PtrType = PointerArg->getType()->getAs<PointerType>();
+ if (!PtrType) {
+ Diag(DRE->getBeginLoc(), diag::err_nontemporal_builtin_must_be_pointer)
+ << PointerArg->getType() << PointerArg->getSourceRange();
+ return true;
+ }
+
+ QualType ValType = PtrType->getPointeeType();
+ ValType = ValType.getUnqualifiedType();
+ if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
+ !ValType->isBlockPointerType() && !ValType->isFloatingType() &&
+ !ValType->isVectorType() && !ValType->isRVVSizelessBuiltinType()) {
+ Diag(DRE->getBeginLoc(),
+ diag::err_nontemporal_builtin_must_be_pointer_intfltptr_or_vector)
+ << PointerArg->getType() << PointerArg->getSourceRange();
+ return true;
+ }
+
+ if (!IsStore) {
+ TheCall->setType(ValType);
+ return false;
+ }
+
+ ExprResult ValArg = TheCall->getArg(1);
+ InitializedEntity Entity = InitializedEntity::InitializeParameter(
+ Context, ValType, /*consume*/ false);
+ ValArg =
+ SemaRef.PerformCopyInitialization(Entity, SourceLocation(), ValArg);
+ if (ValArg.isInvalid())
+ return true;
+
+ TheCall->setArg(1, ValArg.get());
+ TheCall->setType(Context.VoidTy);
+ return false;
+ }
+
+ return false;
+}
+
+void SemaRISCV::checkRVVTypeSupport(QualType Ty, SourceLocation Loc, Decl *D,
+ const llvm::StringMap<bool> &FeatureMap) {
+ ASTContext::BuiltinVectorTypeInfo Info =
+ SemaRef.Context.getBuiltinVectorTypeInfo(Ty->castAs<BuiltinType>());
+ unsigned EltSize = SemaRef.Context.getTypeSize(Info.ElementType);
+ unsigned MinElts = Info.EC.getKnownMinValue();
+
+ if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Double) &&
+ !FeatureMap.lookup("zve64d"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64d";
+  // (ELEN, LMUL) pairs of (8, mf8), (16, mf4), (32, mf2), (64, m1) require at
+  // least zve64x
+ else if (((EltSize == 64 && Info.ElementType->isIntegerType()) ||
+ MinElts == 1) &&
+ !FeatureMap.lookup("zve64x"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve64x";
+ else if (Info.ElementType->isFloat16Type() && !FeatureMap.lookup("zvfh") &&
+ !FeatureMap.lookup("zvfhmin"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D)
+ << Ty << "zvfh or zvfhmin";
+ else if (Info.ElementType->isBFloat16Type() &&
+ !FeatureMap.lookup("experimental-zvfbfmin"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zvfbfmin";
+ else if (Info.ElementType->isSpecificBuiltinType(BuiltinType::Float) &&
+ !FeatureMap.lookup("zve32f"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32f";
+  // Given that the caller already checked isRVVType() before calling this
+  // function, if we don't have at least zve32x supported, we need to emit an
+  // error.
+ else if (!FeatureMap.lookup("zve32x"))
+ Diag(Loc, diag::err_riscv_type_requires_extension, D) << Ty << "zve32x";
+}
+
+/// Are the two types RVV-bitcast-compatible types? I.e. is bitcasting from the
+/// first RVV type (e.g. an RVV scalable type) to the second type (e.g. an RVV
+/// VLS type) allowed?
+///
+/// This will also return false if the two given types do not make sense from
+/// the perspective of RVV bitcasts.
+bool SemaRISCV::isValidRVVBitcast(QualType srcTy, QualType destTy) {
+ assert(srcTy->isVectorType() || destTy->isVectorType());
+
+ auto ValidScalableConversion = [](QualType FirstType, QualType SecondType) {
+ if (!FirstType->isRVVSizelessBuiltinType())
+ return false;
+
+ const auto *VecTy = SecondType->getAs<VectorType>();
+ return VecTy && VecTy->getVectorKind() == VectorKind::RVVFixedLengthData;
+ };
+
+ return ValidScalableConversion(srcTy, destTy) ||
+ ValidScalableConversion(destTy, srcTy);
+}
+
+SemaRISCV::SemaRISCV(Sema &S) : SemaBase(S) {}
+
} // namespace clang
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 36f8ecadcfab..8735d96c8407 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -665,7 +665,8 @@ bool Sema::CheckRebuiltStmtAttributes(ArrayRef<const Attr *> Attrs) {
ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A,
SourceRange Range) {
if (A.getNumArgs() != 1 || !A.getArgAsExpr(0)) {
- Diag(A.getLoc(), diag::err_assume_attr_args) << A.getAttrName() << Range;
+ Diag(A.getLoc(), diag::err_attribute_wrong_number_arguments)
+ << A.getAttrName() << 1 << Range;
return ExprError();
}
@@ -682,8 +683,11 @@ ExprResult Sema::ActOnCXXAssumeAttr(Stmt *St, const ParsedAttr &A,
Assumption = Res.get();
}
- if (!getLangOpts().CPlusPlus23)
+ if (!getLangOpts().CPlusPlus23 &&
+ A.getSyntax() == AttributeCommonInfo::AS_CXX11) {
Diag(A.getLoc(), diag::ext_cxx23_attr) << A << Range;
+ }
return Assumption;
}
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 02d9b64c2b14..39e9dbed0c3e 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1071,7 +1071,8 @@ NamedDecl *Sema::ActOnTypeParameter(Scope *S, bool Typename,
return Param;
}
- Param->setDefaultArgument(DefaultTInfo);
+ Param->setDefaultArgument(
+ Context, TemplateArgumentLoc(DefaultTInfo->getType(), DefaultTInfo));
}
return Param;
@@ -1598,7 +1599,9 @@ NamedDecl *Sema::ActOnNonTypeTemplateParameter(Scope *S, Declarator &D,
if (DiagnoseUnexpandedParameterPack(Default, UPPC_DefaultArgument))
return Param;
- Param->setDefaultArgument(Default);
+ Param->setDefaultArgument(
+ Context, getTrivialTemplateArgumentLoc(TemplateArgument(Default),
+ QualType(), SourceLocation()));
}
return Param;
@@ -1839,7 +1842,8 @@ DeclResult Sema::CheckClassTemplate(
TemplateParameterList **OuterTemplateParamLists, SkipBodyInfo *SkipBody) {
assert(TemplateParams && TemplateParams->size() > 0 &&
"No template parameters");
- assert(TUK != TUK_Reference && "Can only declare or define class templates");
+ assert(TUK != TagUseKind::Reference &&
+ "Can only declare or define class templates");
bool Invalid = false;
// Check that we can declare a template here.
@@ -1861,8 +1865,9 @@ DeclResult Sema::CheckClassTemplate(
// C++11 [basic.lookup.elab]p2).
DeclContext *SemanticContext;
LookupResult Previous(*this, Name, NameLoc,
- (SS.isEmpty() && TUK == TUK_Friend)
- ? LookupTagName : LookupOrdinaryName,
+ (SS.isEmpty() && TUK == TagUseKind::Friend)
+ ? LookupTagName
+ : LookupOrdinaryName,
forRedeclarationInCurContext());
if (SS.isNotEmpty() && !SS.isInvalid()) {
SemanticContext = computeDeclContext(SS, true);
@@ -1870,11 +1875,11 @@ DeclResult Sema::CheckClassTemplate(
// FIXME: Horrible, horrible hack! We can't currently represent this
// in the AST, and historically we have just ignored such friend
// class templates, so don't complain here.
- Diag(NameLoc, TUK == TUK_Friend
+ Diag(NameLoc, TUK == TagUseKind::Friend
? diag::warn_template_qualified_friend_ignored
: diag::err_template_qualified_declarator_no_match)
<< SS.getScopeRep() << SS.getRange();
- return TUK != TUK_Friend;
+ return TUK != TagUseKind::Friend;
}
if (RequireCompleteDeclContext(SS, SemanticContext))
@@ -1889,7 +1894,7 @@ DeclResult Sema::CheckClassTemplate(
Invalid = true;
}
- if (TUK != TUK_Friend && TUK != TUK_Reference)
+ if (TUK != TagUseKind::Friend && TUK != TagUseKind::Reference)
diagnoseQualifiedDeclaration(SS, SemanticContext, Name, NameLoc,
/*TemplateId=*/ nullptr,
/*IsMemberSpecialization*/ false);
@@ -1902,7 +1907,7 @@ DeclResult Sema::CheckClassTemplate(
// If T is the name of a class, then each of the following shall have a
// name different from T:
// -- every member template of class T
- if (TUK != TUK_Friend &&
+ if (TUK != TagUseKind::Friend &&
DiagnoseClassNameShadow(SemanticContext,
DeclarationNameInfo(Name, NameLoc)))
return true;
@@ -1944,7 +1949,7 @@ DeclResult Sema::CheckClassTemplate(
}
}
- if (TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
// C++ [namespace.memdef]p3:
// [...] When looking for a prior declaration of a class or a function
// declared as a friend, and when the name of the friend class or
@@ -1981,9 +1986,8 @@ DeclResult Sema::CheckClassTemplate(
PrevDecl = (*Previous.begin())->getUnderlyingDecl();
}
}
- } else if (PrevDecl &&
- !isDeclInScope(Previous.getRepresentativeDecl(), SemanticContext,
- S, SS.isValid()))
+ } else if (PrevDecl && !isDeclInScope(Previous.getRepresentativeDecl(),
+ SemanticContext, S, SS.isValid()))
PrevDecl = PrevClassTemplate = nullptr;
if (auto *Shadow = dyn_cast_or_null<UsingShadowDecl>(
@@ -2005,7 +2009,7 @@ DeclResult Sema::CheckClassTemplate(
// Ensure that the template parameter lists are compatible. Skip this check
// for a friend in a dependent context: the template parameter list itself
// could be dependent.
- if (!(TUK == TUK_Friend && CurContext->isDependentContext()) &&
+ if (!(TUK == TagUseKind::Friend && CurContext->isDependentContext()) &&
!TemplateParameterListsAreEqual(
TemplateCompareNewDeclInfo(SemanticContext ? SemanticContext
: CurContext,
@@ -2021,8 +2025,8 @@ DeclResult Sema::CheckClassTemplate(
// the class-key shall agree in kind with the original class
// template declaration (7.1.5.3).
RecordDecl *PrevRecordDecl = PrevClassTemplate->getTemplatedDecl();
- if (!isAcceptableTagRedeclaration(PrevRecordDecl, Kind,
- TUK == TUK_Definition, KWLoc, Name)) {
+ if (!isAcceptableTagRedeclaration(
+ PrevRecordDecl, Kind, TUK == TagUseKind::Definition, KWLoc, Name)) {
Diag(KWLoc, diag::err_use_with_wrong_tag)
<< Name
<< FixItHint::CreateReplacement(KWLoc, PrevRecordDecl->getKindName());
@@ -2031,7 +2035,7 @@ DeclResult Sema::CheckClassTemplate(
}
// Check for redefinition of this class template.
- if (TUK == TUK_Definition) {
+ if (TUK == TagUseKind::Definition) {
if (TagDecl *Def = PrevRecordDecl->getDefinition()) {
// If we have a prior definition that is not visible, treat this as
// simply making that previous definition visible.
@@ -2068,7 +2072,7 @@ DeclResult Sema::CheckClassTemplate(
// merging in the template parameter list from the previous class
// template declaration. Skip this check for a friend in a dependent
// context, because the template parameter list might be dependent.
- if (!(TUK == TUK_Friend && CurContext->isDependentContext()) &&
+ if (!(TUK == TagUseKind::Friend && CurContext->isDependentContext()) &&
CheckTemplateParameterList(
TemplateParams,
PrevClassTemplate ? GetTemplateParameterList(PrevClassTemplate)
@@ -2076,8 +2080,8 @@ DeclResult Sema::CheckClassTemplate(
(SS.isSet() && SemanticContext && SemanticContext->isRecord() &&
SemanticContext->isDependentContext())
? TPC_ClassTemplateMember
- : TUK == TUK_Friend ? TPC_FriendClassTemplate
- : TPC_ClassTemplate,
+ : TUK == TagUseKind::Friend ? TPC_FriendClassTemplate
+ : TPC_ClassTemplate,
SkipBody))
Invalid = true;
@@ -2085,9 +2089,10 @@ DeclResult Sema::CheckClassTemplate(
// If the name of the template was qualified, we must be defining the
// template out-of-line.
if (!SS.isInvalid() && !Invalid && !PrevClassTemplate) {
- Diag(NameLoc, TUK == TUK_Friend ? diag::err_friend_decl_does_not_match
- : diag::err_member_decl_does_not_match)
- << Name << SemanticContext << /*IsDefinition*/true << SS.getRange();
+ Diag(NameLoc, TUK == TagUseKind::Friend
+ ? diag::err_friend_decl_does_not_match
+ : diag::err_member_decl_does_not_match)
+ << Name << SemanticContext << /*IsDefinition*/ true << SS.getRange();
Invalid = true;
}
}
@@ -2097,8 +2102,8 @@ DeclResult Sema::CheckClassTemplate(
// recent declaration tricking the template instantiator to make substitutions
// there.
// FIXME: Figure out how to combine with shouldLinkDependentDeclWithPrevious
- bool ShouldAddRedecl
- = !(TUK == TUK_Friend && CurContext->isDependentContext());
+ bool ShouldAddRedecl =
+ !(TUK == TagUseKind::Friend && CurContext->isDependentContext());
CXXRecordDecl *NewClass =
CXXRecordDecl::Create(Context, Kind, SemanticContext, KWLoc, NameLoc, Name,
@@ -2113,7 +2118,7 @@ DeclResult Sema::CheckClassTemplate(
// Add alignment attributes if necessary; these attributes are checked when
// the ASTContext lays out the structure.
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(NewClass);
AddMsStructLayoutForRecord(NewClass);
}
@@ -2144,14 +2149,15 @@ DeclResult Sema::CheckClassTemplate(
PrevClassTemplate->setMemberSpecialization();
// Set the access specifier.
- if (!Invalid && TUK != TUK_Friend && NewTemplate->getDeclContext()->isRecord())
+ if (!Invalid && TUK != TagUseKind::Friend &&
+ NewTemplate->getDeclContext()->isRecord())
SetMemberAccessSpecifier(NewTemplate, PrevClassTemplate, AS);
// Set the lexical context of these templates
NewClass->setLexicalDeclContext(CurContext);
NewTemplate->setLexicalDeclContext(CurContext);
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip))
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip))
NewClass->startDefinition();
ProcessDeclAttributeList(S, NewClass, Attr);
@@ -2164,7 +2170,7 @@ DeclResult Sema::CheckClassTemplate(
inferGslOwnerPointerAttribute(NewClass);
inferNullableClassAttribute(NewClass);
- if (TUK != TUK_Friend) {
+ if (TUK != TagUseKind::Friend) {
// Per C++ [basic.scope.temp]p2, skip the template parameter scopes.
Scope *Outer = S;
while ((Outer->getFlags() & Scope::TemplateParamScope) != 0)
@@ -2318,11 +2324,11 @@ transformTemplateTypeParam(Sema &SemaRef, DeclContext *DC,
SemaRef.SubstTypeConstraint(NewTTP, TC, Args,
/*EvaluateConstraint=*/true);
if (TTP->hasDefaultArgument()) {
- TypeSourceInfo *InstantiatedDefaultArg =
- SemaRef.SubstType(TTP->getDefaultArgumentInfo(), Args,
- TTP->getDefaultArgumentLoc(), TTP->getDeclName());
- if (InstantiatedDefaultArg)
- NewTTP->setDefaultArgument(InstantiatedDefaultArg);
+ TemplateArgumentLoc InstantiatedDefaultArg;
+ if (!SemaRef.SubstTemplateArgument(
+ TTP->getDefaultArgument(), Args, InstantiatedDefaultArg,
+ TTP->getDefaultArgumentLoc(), TTP->getDeclName()))
+ NewTTP->setDefaultArgument(SemaRef.Context, InstantiatedDefaultArg);
}
SemaRef.CurrentInstantiationScope->InstantiatedLocal(TTP, NewTTP);
return NewTTP;
@@ -3575,10 +3581,9 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams,
= dyn_cast<TemplateTypeParmDecl>(*NewParam)) {
// Check the presence of a default argument here.
if (NewTypeParm->hasDefaultArgument() &&
- DiagnoseDefaultTemplateArgument(*this, TPC,
- NewTypeParm->getLocation(),
- NewTypeParm->getDefaultArgumentInfo()->getTypeLoc()
- .getSourceRange()))
+ DiagnoseDefaultTemplateArgument(
+ *this, TPC, NewTypeParm->getLocation(),
+ NewTypeParm->getDefaultArgument().getSourceRange()))
NewTypeParm->removeDefaultArgument();
// Merge default arguments for template type parameters.
@@ -3627,9 +3632,9 @@ bool Sema::CheckTemplateParameterList(TemplateParameterList *NewParams,
// Check the presence of a default argument here.
if (NewNonTypeParm->hasDefaultArgument() &&
- DiagnoseDefaultTemplateArgument(*this, TPC,
- NewNonTypeParm->getLocation(),
- NewNonTypeParm->getDefaultArgument()->getSourceRange())) {
+ DiagnoseDefaultTemplateArgument(
+ *this, TPC, NewNonTypeParm->getLocation(),
+ NewNonTypeParm->getDefaultArgument().getSourceRange())) {
NewNonTypeParm->removeDefaultArgument();
}
@@ -5015,7 +5020,7 @@ TypeResult Sema::ActOnTagTemplateIdType(TagUseKind TUK,
IdentifierInfo *Id = D->getIdentifier();
assert(Id && "templated class must have an identifier");
- if (!isAcceptableTagRedeclaration(D, TagKind, TUK == TUK_Definition,
+ if (!isAcceptableTagRedeclaration(D, TagKind, TUK == TagUseKind::Definition,
TagLoc, Id)) {
Diag(TagLoc, diag::err_use_with_wrong_tag)
<< Result
@@ -6040,22 +6045,26 @@ bool Sema::CheckTemplateTypeArgument(
///
/// \param Converted the list of template arguments provided for template
/// parameters that precede \p Param in the template parameter list.
-/// \returns the substituted template argument, or NULL if an error occurred.
-static TypeSourceInfo *SubstDefaultTemplateArgument(
+///
+/// \param Output the resulting substituted template argument.
+///
+/// \returns true if an error occurred.
+static bool SubstDefaultTemplateArgument(
Sema &SemaRef, TemplateDecl *Template, SourceLocation TemplateLoc,
SourceLocation RAngleLoc, TemplateTypeParmDecl *Param,
ArrayRef<TemplateArgument> SugaredConverted,
- ArrayRef<TemplateArgument> CanonicalConverted) {
- TypeSourceInfo *ArgType = Param->getDefaultArgumentInfo();
+ ArrayRef<TemplateArgument> CanonicalConverted,
+ TemplateArgumentLoc &Output) {
+ Output = Param->getDefaultArgument();
// If the argument type is dependent, instantiate it now based
// on the previously-computed template arguments.
- if (ArgType->getType()->isInstantiationDependentType()) {
+ if (Output.getArgument().isInstantiationDependent()) {
Sema::InstantiatingTemplate Inst(SemaRef, TemplateLoc, Param, Template,
SugaredConverted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
- return nullptr;
+ return true;
// Only substitute for the innermost template argument list.
MultiLevelTemplateArgumentList TemplateArgLists(Template, SugaredConverted,
@@ -6068,12 +6077,14 @@ static TypeSourceInfo *SubstDefaultTemplateArgument(
ForLambdaCallOperator = Rec->isLambda();
Sema::ContextRAII SavedContext(SemaRef, Template->getDeclContext(),
!ForLambdaCallOperator);
- ArgType =
- SemaRef.SubstType(ArgType, TemplateArgLists,
- Param->getDefaultArgumentLoc(), Param->getDeclName());
+
+ if (SemaRef.SubstTemplateArgument(Output, TemplateArgLists, Output,
+ Param->getDefaultArgumentLoc(),
+ Param->getDeclName()))
+ return true;
}
- return ArgType;
+ return false;
}
/// Substitute template arguments into the default template argument for
@@ -6098,16 +6109,17 @@ static TypeSourceInfo *SubstDefaultTemplateArgument(
/// parameters that precede \p Param in the template parameter list.
///
/// \returns the substituted template argument, or NULL if an error occurred.
-static ExprResult SubstDefaultTemplateArgument(
+static bool SubstDefaultTemplateArgument(
Sema &SemaRef, TemplateDecl *Template, SourceLocation TemplateLoc,
SourceLocation RAngleLoc, NonTypeTemplateParmDecl *Param,
ArrayRef<TemplateArgument> SugaredConverted,
- ArrayRef<TemplateArgument> CanonicalConverted) {
+ ArrayRef<TemplateArgument> CanonicalConverted,
+ TemplateArgumentLoc &Output) {
Sema::InstantiatingTemplate Inst(SemaRef, TemplateLoc, Param, Template,
SugaredConverted,
SourceRange(TemplateLoc, RAngleLoc));
if (Inst.isInvalid())
- return ExprError();
+ return true;
// Only substitute for the innermost template argument list.
MultiLevelTemplateArgumentList TemplateArgLists(Template, SugaredConverted,
@@ -6118,7 +6130,8 @@ static ExprResult SubstDefaultTemplateArgument(
Sema::ContextRAII SavedContext(SemaRef, Template->getDeclContext());
EnterExpressionEvaluationContext ConstantEvaluated(
SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
- return SemaRef.SubstExpr(Param->getDefaultArgument(), TemplateArgLists);
+ return SemaRef.SubstTemplateArgument(Param->getDefaultArgument(),
+ TemplateArgLists, Output);
}
/// Substitute template arguments into the default template argument for
@@ -6196,13 +6209,12 @@ TemplateArgumentLoc Sema::SubstDefaultTemplateArgumentIfAvailable(
return TemplateArgumentLoc();
HasDefaultArg = true;
- TypeSourceInfo *DI = SubstDefaultTemplateArgument(
- *this, Template, TemplateLoc, RAngleLoc, TypeParm, SugaredConverted,
- CanonicalConverted);
- if (DI)
- return TemplateArgumentLoc(TemplateArgument(DI->getType()), DI);
-
- return TemplateArgumentLoc();
+ TemplateArgumentLoc Output;
+ if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc,
+ TypeParm, SugaredConverted,
+ CanonicalConverted, Output))
+ return TemplateArgumentLoc();
+ return Output;
}
if (NonTypeTemplateParmDecl *NonTypeParm
@@ -6211,14 +6223,12 @@ TemplateArgumentLoc Sema::SubstDefaultTemplateArgumentIfAvailable(
return TemplateArgumentLoc();
HasDefaultArg = true;
- ExprResult Arg = SubstDefaultTemplateArgument(
- *this, Template, TemplateLoc, RAngleLoc, NonTypeParm, SugaredConverted,
- CanonicalConverted);
- if (Arg.isInvalid())
+ TemplateArgumentLoc Output;
+ if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc,
+ NonTypeParm, SugaredConverted,
+ CanonicalConverted, Output))
return TemplateArgumentLoc();
-
- Expr *ArgE = Arg.getAs<Expr>();
- return TemplateArgumentLoc(TemplateArgument(ArgE), ArgE);
+ return Output;
}
TemplateTemplateParmDecl *TempTempParm
@@ -6785,28 +6795,20 @@ bool Sema::CheckTemplateArgumentList(
return diagnoseMissingArgument(*this, TemplateLoc, Template, TTP,
NewArgs);
- TypeSourceInfo *ArgType = SubstDefaultTemplateArgument(
- *this, Template, TemplateLoc, RAngleLoc, TTP, SugaredConverted,
- CanonicalConverted);
- if (!ArgType)
+ if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc,
+ TTP, SugaredConverted,
+ CanonicalConverted, Arg))
return true;
-
- Arg = TemplateArgumentLoc(TemplateArgument(ArgType->getType()),
- ArgType);
} else if (NonTypeTemplateParmDecl *NTTP
= dyn_cast<NonTypeTemplateParmDecl>(*Param)) {
if (!hasReachableDefaultArgument(NTTP))
return diagnoseMissingArgument(*this, TemplateLoc, Template, NTTP,
NewArgs);
- ExprResult E = SubstDefaultTemplateArgument(
- *this, Template, TemplateLoc, RAngleLoc, NTTP, SugaredConverted,
- CanonicalConverted);
- if (E.isInvalid())
+ if (SubstDefaultTemplateArgument(*this, Template, TemplateLoc, RAngleLoc,
+ NTTP, SugaredConverted,
+ CanonicalConverted, Arg))
return true;
-
- Expr *Ex = E.getAs<Expr>();
- Arg = TemplateArgumentLoc(TemplateArgument(Ex), Ex);
} else {
TemplateTemplateParmDecl *TempParm
= cast<TemplateTemplateParmDecl>(*Param);
@@ -9451,7 +9453,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
SourceLocation ModulePrivateLoc, CXXScopeSpec &SS,
TemplateIdAnnotation &TemplateId, const ParsedAttributesView &Attr,
MultiTemplateParamsArg TemplateParameterLists, SkipBodyInfo *SkipBody) {
- assert(TUK != TUK_Reference && "References are not specializations");
+ assert(TUK != TagUseKind::Reference && "References are not specializations");
SourceLocation TemplateNameLoc = TemplateId.TemplateNameLoc;
SourceLocation LAngleLoc = TemplateId.LAngleLoc;
@@ -9473,7 +9475,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
bool isPartialSpecialization = false;
if (SS.isSet()) {
- if (TUK != TUK_Reference && TUK != TUK_Friend &&
+ if (TUK != TagUseKind::Reference && TUK != TagUseKind::Friend &&
diagnoseQualifiedDeclaration(SS, ClassTemplate->getDeclContext(),
ClassTemplate->getDeclName(),
TemplateNameLoc, &TemplateId,
@@ -9488,9 +9490,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
bool Invalid = false;
TemplateParameterList *TemplateParams =
MatchTemplateParametersToScopeSpecifier(
- KWLoc, TemplateNameLoc, SS, &TemplateId,
- TemplateParameterLists, TUK == TUK_Friend, isMemberSpecialization,
- Invalid);
+ KWLoc, TemplateNameLoc, SS, &TemplateId, TemplateParameterLists,
+ TUK == TagUseKind::Friend, isMemberSpecialization, Invalid);
if (Invalid)
return true;
@@ -9501,7 +9502,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
if (TemplateParams && TemplateParams->size() > 0) {
isPartialSpecialization = true;
- if (TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
Diag(KWLoc, diag::err_partial_specialization_friend)
<< SourceRange(LAngleLoc, RAngleLoc);
return true;
@@ -9520,10 +9521,10 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
}
} else if (NonTypeTemplateParmDecl *NTTP
= dyn_cast<NonTypeTemplateParmDecl>(Param)) {
- if (Expr *DefArg = NTTP->getDefaultArgument()) {
+ if (NTTP->hasDefaultArgument()) {
Diag(NTTP->getDefaultArgumentLoc(),
diag::err_default_arg_in_partial_spec)
- << DefArg->getSourceRange();
+ << NTTP->getDefaultArgument().getSourceRange();
NTTP->removeDefaultArgument();
}
} else {
@@ -9537,14 +9538,15 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
}
}
} else if (TemplateParams) {
- if (TUK == TUK_Friend)
+ if (TUK == TagUseKind::Friend)
Diag(KWLoc, diag::err_template_spec_friend)
<< FixItHint::CreateRemoval(
SourceRange(TemplateParams->getTemplateLoc(),
TemplateParams->getRAngleLoc()))
<< SourceRange(LAngleLoc, RAngleLoc);
} else {
- assert(TUK == TUK_Friend && "should have a 'template<>' for this decl");
+ assert(TUK == TagUseKind::Friend &&
+ "should have a 'template<>' for this decl");
}
// Check that the specialization uses the same tag kind as the
@@ -9552,8 +9554,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
assert(Kind != TagTypeKind::Enum &&
"Invalid enum tag in class template spec!");
- if (!isAcceptableTagRedeclaration(ClassTemplate->getTemplatedDecl(),
- Kind, TUK == TUK_Definition, KWLoc,
+ if (!isAcceptableTagRedeclaration(ClassTemplate->getTemplatedDecl(), Kind,
+ TUK == TagUseKind::Definition, KWLoc,
ClassTemplate->getIdentifier())) {
Diag(KWLoc, diag::err_use_with_wrong_tag)
<< ClassTemplate
@@ -9617,7 +9619,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
// Check whether we can declare a class template specialization in
// the current scope.
- if (TUK != TUK_Friend &&
+ if (TUK != TagUseKind::Friend &&
CheckTemplateSpecializationScope(*this, ClassTemplate, PrevDecl,
TemplateNameLoc,
isPartialSpecialization))
@@ -9644,8 +9646,8 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
// This rule has since been removed, because it's redundant given DR1495,
// but we keep it because it produces better diagnostics and recovery.
Diag(TemplateNameLoc, diag::err_partial_spec_args_match_primary_template)
- << /*class template*/0 << (TUK == TUK_Definition)
- << FixItHint::CreateRemoval(SourceRange(LAngleLoc, RAngleLoc));
+ << /*class template*/ 0 << (TUK == TagUseKind::Definition)
+ << FixItHint::CreateRemoval(SourceRange(LAngleLoc, RAngleLoc));
return CheckClassTemplate(S, TagSpec, TUK, KWLoc, SS,
ClassTemplate->getIdentifier(),
TemplateNameLoc,
@@ -9737,11 +9739,11 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
}
// If this is not a friend, note that this is an explicit specialization.
- if (TUK != TUK_Friend)
+ if (TUK != TagUseKind::Friend)
Specialization->setSpecializationKind(TSK_ExplicitSpecialization);
// Check that this isn't a redefinition of this specialization.
- if (TUK == TUK_Definition) {
+ if (TUK == TagUseKind::Definition) {
RecordDecl *Def = Specialization->getDefinition();
NamedDecl *Hidden = nullptr;
if (Def && SkipBody && !hasVisibleDefinition(Def, &Hidden)) {
@@ -9762,7 +9764,7 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
// Add alignment attributes if necessary; these attributes are checked when
// the ASTContext lays out the structure.
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip)) {
AddAlignmentAttributesForRecord(Specialization);
AddMsStructLayoutForRecord(Specialization);
}
@@ -9783,10 +9785,10 @@ DeclResult Sema::ActOnClassTemplateSpecialization(
Specialization->setLexicalDeclContext(CurContext);
// We may be starting the definition of this specialization.
- if (TUK == TUK_Definition && (!SkipBody || !SkipBody->ShouldSkip))
+ if (TUK == TagUseKind::Definition && (!SkipBody || !SkipBody->ShouldSkip))
Specialization->startDefinition();
- if (TUK == TUK_Friend) {
+ if (TUK == TagUseKind::Friend) {
// Build the fully-sugared type for this class template
// specialization as the user wrote in the specialization
// itself. This means that we'll pretty-print the type retrieved
@@ -11160,11 +11162,13 @@ Sema::ActOnExplicitInstantiation(Scope *S, SourceLocation ExternLoc,
bool Owned = false;
bool IsDependent = false;
- Decl *TagD = ActOnTag(S, TagSpec, Sema::TUK_Reference, KWLoc, SS, Name,
- NameLoc, Attr, AS_none, /*ModulePrivateLoc=*/SourceLocation(),
+ Decl *TagD =
+ ActOnTag(S, TagSpec, TagUseKind::Reference, KWLoc, SS, Name, NameLoc,
+ Attr, AS_none, /*ModulePrivateLoc=*/SourceLocation(),
MultiTemplateParamsArg(), Owned, IsDependent, SourceLocation(),
false, TypeResult(), /*IsTypeSpecifier*/ false,
- /*IsTemplateParamOrArg*/ false, /*OOK=*/OOK_Outside).get();
+ /*IsTemplateParamOrArg*/ false, /*OOK=*/OOK_Outside)
+ .get();
assert(!IsDependent && "explicit instantiation of dependent name not yet handled");
if (!TagD)
@@ -11695,9 +11699,9 @@ TypeResult Sema::ActOnDependentTag(Scope *S, unsigned TagSpec, TagUseKind TUK,
TagTypeKind Kind = TypeWithKeyword::getTagTypeKindForTypeSpec(TagSpec);
- if (TUK == TUK_Declaration || TUK == TUK_Definition) {
+ if (TUK == TagUseKind::Declaration || TUK == TagUseKind::Definition) {
Diag(NameLoc, diag::err_dependent_tag_decl)
- << (TUK == TUK_Definition) << llvm::to_underlying(Kind)
+ << (TUK == TagUseKind::Definition) << llvm::to_underlying(Kind)
<< SS.getRange();
return true;
}
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 41fd210f29d0..f9ec34163e65 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -519,18 +519,14 @@ static NamedDecl *getTemplateParameterWithDefault(Sema &S, NamedDecl *A,
switch (A->getKind()) {
case Decl::TemplateTypeParm: {
auto *T = cast<TemplateTypeParmDecl>(A);
- // FIXME: A TemplateTypeParmDecl's DefaultArgument can't hold a full
- // TemplateArgument, so there is currently no way to specify a pack as a
- // default argument for these.
- if (T->isParameterPack())
- return A;
auto *R = TemplateTypeParmDecl::Create(
S.Context, A->getDeclContext(), SourceLocation(), SourceLocation(),
T->getDepth(), T->getIndex(), T->getIdentifier(),
- T->wasDeclaredWithTypename(), /*ParameterPack=*/false,
+ T->wasDeclaredWithTypename(), T->isParameterPack(),
T->hasTypeConstraint());
R->setDefaultArgument(
- S.Context.getTrivialTypeSourceInfo(Default.getAsType()));
+ S.Context,
+ S.getTrivialTemplateArgumentLoc(Default, QualType(), SourceLocation()));
if (R->hasTypeConstraint()) {
auto *C = R->getTypeConstraint();
R->setTypeConstraint(C->getConceptReference(),
@@ -540,14 +536,14 @@ static NamedDecl *getTemplateParameterWithDefault(Sema &S, NamedDecl *A,
}
case Decl::NonTypeTemplateParm: {
auto *T = cast<NonTypeTemplateParmDecl>(A);
- // FIXME: Ditto, as above for TemplateTypeParm case.
- if (T->isParameterPack())
- return A;
auto *R = NonTypeTemplateParmDecl::Create(
S.Context, A->getDeclContext(), SourceLocation(), SourceLocation(),
T->getDepth(), T->getIndex(), T->getIdentifier(), T->getType(),
- /*ParameterPack=*/false, T->getTypeSourceInfo());
- R->setDefaultArgument(Default.getAsExpr());
+ T->isParameterPack(), T->getTypeSourceInfo());
+ R->setDefaultArgument(S.Context,
+ S.getTrivialTemplateArgumentLoc(
+ Default, Default.getNonTypeTemplateArgumentType(),
+ SourceLocation()));
if (auto *PTC = T->getPlaceholderTypeConstraint())
R->setPlaceholderTypeConstraint(PTC);
return R;
@@ -4776,8 +4772,13 @@ TemplateDeductionResult Sema::DeduceTemplateArguments(
DeduceReturnType(Specialization, Info.getLocation(), false))
return TemplateDeductionResult::MiscellaneousDeductionFailure;
+ // [C++26][expr.const]/p17
+ // An expression or conversion is immediate-escalating if it is not initially
+ // in an immediate function context and it is [...]
+ // a potentially-evaluated id-expression that denotes an immediate function.
if (IsAddressOfFunction && getLangOpts().CPlusPlus20 &&
Specialization->isImmediateEscalating() &&
+ parentEvaluationContext().isPotentiallyEvaluated() &&
CheckIfFunctionSpecializationIsImmediate(Specialization,
Info.getLocation()))
return TemplateDeductionResult::MiscellaneousDeductionFailure;
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp
index 07626058c797..abb8a260faab 100644
--- a/clang/lib/Sema/SemaTemplateInstantiate.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp
@@ -1619,11 +1619,6 @@ namespace {
case TemplateArgument::Pack:
// Literally rewrite the template argument pack, instead of unpacking
// it.
- assert(
- SemaRef.CodeSynthesisContexts.back().Kind ==
- Sema::CodeSynthesisContext::BuildingDeductionGuides &&
- "Transforming a template argument pack is only allowed in building "
- "deduction guide");
for (auto &pack : Arg.getPackAsArray()) {
TemplateArgumentLoc Input = SemaRef.getTrivialTemplateArgumentLoc(
pack, QualType(), SourceLocation{});
@@ -4375,9 +4370,9 @@ Sema::SubstStmt(Stmt *S, const MultiLevelTemplateArgumentList &TemplateArgs) {
bool Sema::SubstTemplateArgument(
const TemplateArgumentLoc &Input,
const MultiLevelTemplateArgumentList &TemplateArgs,
- TemplateArgumentLoc &Output) {
- TemplateInstantiator Instantiator(*this, TemplateArgs, SourceLocation(),
- DeclarationName());
+ TemplateArgumentLoc &Output, SourceLocation Loc,
+ const DeclarationName &Entity) {
+ TemplateInstantiator Instantiator(*this, TemplateArgs, Loc, Entity);
return Instantiator.TransformTemplateArgument(Input, Output);
}
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index 381d79b2fcd4..bb49aae2cb66 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -2956,11 +2956,10 @@ Decl *TemplateDeclInstantiator::VisitTemplateTypeParmDecl(
}
}
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited()) {
- TypeSourceInfo *InstantiatedDefaultArg =
- SemaRef.SubstType(D->getDefaultArgumentInfo(), TemplateArgs,
- D->getDefaultArgumentLoc(), D->getDeclName());
- if (InstantiatedDefaultArg)
- Inst->setDefaultArgument(InstantiatedDefaultArg);
+ TemplateArgumentLoc Output;
+ if (!SemaRef.SubstTemplateArgument(D->getDefaultArgument(), TemplateArgs,
+ Output))
+ Inst->setDefaultArgument(SemaRef.getASTContext(), Output);
}
// Introduce this template parameter's instantiation into the instantiation
@@ -3124,9 +3123,10 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl(
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited()) {
EnterExpressionEvaluationContext ConstantEvaluated(
SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
- ExprResult Value = SemaRef.SubstExpr(D->getDefaultArgument(), TemplateArgs);
- if (!Value.isInvalid())
- Param->setDefaultArgument(Value.get());
+ TemplateArgumentLoc Result;
+ if (!SemaRef.SubstTemplateArgument(D->getDefaultArgument(), TemplateArgs,
+ Result))
+ Param->setDefaultArgument(SemaRef.Context, Result);
}
// Introduce this template parameter's instantiation into the instantiation
@@ -5055,6 +5055,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
Function->setLocation(PatternDecl->getLocation());
Function->setInnerLocStart(PatternDecl->getInnerLocStart());
Function->setRangeEnd(PatternDecl->getEndLoc());
+ Function->setDeclarationNameLoc(PatternDecl->getNameInfo().getInfo());
EnterExpressionEvaluationContext EvalContext(
*this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index c19c8cc34dd3..ef0b6b701a52 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -9345,9 +9345,9 @@ BuildTypeCoupledDecls(Expr *E,
Decls.push_back(TypeCoupledDeclRefInfo(CountDecl, /*IsDref*/ false));
}
-QualType Sema::BuildCountAttributedArrayType(QualType WrappedTy,
- Expr *CountExpr) {
- assert(WrappedTy->isIncompleteArrayType());
+QualType Sema::BuildCountAttributedArrayOrPointerType(QualType WrappedTy,
+ Expr *CountExpr) {
+ assert(WrappedTy->isIncompleteArrayType() || WrappedTy->isPointerType());
llvm::SmallVector<TypeCoupledDeclRefInfo, 1> Decls;
BuildTypeCoupledDecls(CountExpr, Decls);
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
new file mode 100644
index 000000000000..ffac1afc5d78
--- /dev/null
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -0,0 +1,878 @@
+//===------ SemaX86.cpp ---------- X86 target-specific routines -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements semantic analysis functions specific to X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Sema/SemaX86.h"
+#include "clang/Basic/DiagnosticSema.h"
+#include "clang/Basic/TargetBuiltins.h"
+#include "clang/Sema/Sema.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/TargetParser/Triple.h"
+#include <bitset>
+
+namespace clang {
+
+SemaX86::SemaX86(Sema &S) : SemaBase(S) {}
+
+// Check if the rounding mode is legal.
+bool SemaX86::CheckBuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
+ // Indicates if this instruction has rounding control or just SAE.
+ bool HasRC = false;
+
+ unsigned ArgNum = 0;
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_vcvttsd2si32:
+ case X86::BI__builtin_ia32_vcvttsd2si64:
+ case X86::BI__builtin_ia32_vcvttsd2usi32:
+ case X86::BI__builtin_ia32_vcvttsd2usi64:
+ case X86::BI__builtin_ia32_vcvttss2si32:
+ case X86::BI__builtin_ia32_vcvttss2si64:
+ case X86::BI__builtin_ia32_vcvttss2usi32:
+ case X86::BI__builtin_ia32_vcvttss2usi64:
+ case X86::BI__builtin_ia32_vcvttsh2si32:
+ case X86::BI__builtin_ia32_vcvttsh2si64:
+ case X86::BI__builtin_ia32_vcvttsh2usi32:
+ case X86::BI__builtin_ia32_vcvttsh2usi64:
+ ArgNum = 1;
+ break;
+ case X86::BI__builtin_ia32_maxpd512:
+ case X86::BI__builtin_ia32_maxps512:
+ case X86::BI__builtin_ia32_minpd512:
+ case X86::BI__builtin_ia32_minps512:
+ case X86::BI__builtin_ia32_maxph512:
+ case X86::BI__builtin_ia32_minph512:
+ ArgNum = 2;
+ break;
+ case X86::BI__builtin_ia32_vcvtph2pd512_mask:
+ case X86::BI__builtin_ia32_vcvtph2psx512_mask:
+ case X86::BI__builtin_ia32_cvtps2pd512_mask:
+ case X86::BI__builtin_ia32_cvttpd2dq512_mask:
+ case X86::BI__builtin_ia32_cvttpd2qq512_mask:
+ case X86::BI__builtin_ia32_cvttpd2udq512_mask:
+ case X86::BI__builtin_ia32_cvttpd2uqq512_mask:
+ case X86::BI__builtin_ia32_cvttps2dq512_mask:
+ case X86::BI__builtin_ia32_cvttps2qq512_mask:
+ case X86::BI__builtin_ia32_cvttps2udq512_mask:
+ case X86::BI__builtin_ia32_cvttps2uqq512_mask:
+ case X86::BI__builtin_ia32_vcvttph2w512_mask:
+ case X86::BI__builtin_ia32_vcvttph2uw512_mask:
+ case X86::BI__builtin_ia32_vcvttph2dq512_mask:
+ case X86::BI__builtin_ia32_vcvttph2udq512_mask:
+ case X86::BI__builtin_ia32_vcvttph2qq512_mask:
+ case X86::BI__builtin_ia32_vcvttph2uqq512_mask:
+ case X86::BI__builtin_ia32_getexppd512_mask:
+ case X86::BI__builtin_ia32_getexpps512_mask:
+ case X86::BI__builtin_ia32_getexpph512_mask:
+ case X86::BI__builtin_ia32_vcomisd:
+ case X86::BI__builtin_ia32_vcomiss:
+ case X86::BI__builtin_ia32_vcomish:
+ case X86::BI__builtin_ia32_vcvtph2ps512_mask:
+ ArgNum = 3;
+ break;
+ case X86::BI__builtin_ia32_cmppd512_mask:
+ case X86::BI__builtin_ia32_cmpps512_mask:
+ case X86::BI__builtin_ia32_cmpsd_mask:
+ case X86::BI__builtin_ia32_cmpss_mask:
+ case X86::BI__builtin_ia32_cmpsh_mask:
+ case X86::BI__builtin_ia32_vcvtsh2sd_round_mask:
+ case X86::BI__builtin_ia32_vcvtsh2ss_round_mask:
+ case X86::BI__builtin_ia32_cvtss2sd_round_mask:
+ case X86::BI__builtin_ia32_getexpsd128_round_mask:
+ case X86::BI__builtin_ia32_getexpss128_round_mask:
+ case X86::BI__builtin_ia32_getexpsh128_round_mask:
+ case X86::BI__builtin_ia32_getmantpd512_mask:
+ case X86::BI__builtin_ia32_getmantps512_mask:
+ case X86::BI__builtin_ia32_getmantph512_mask:
+ case X86::BI__builtin_ia32_maxsd_round_mask:
+ case X86::BI__builtin_ia32_maxss_round_mask:
+ case X86::BI__builtin_ia32_maxsh_round_mask:
+ case X86::BI__builtin_ia32_minsd_round_mask:
+ case X86::BI__builtin_ia32_minss_round_mask:
+ case X86::BI__builtin_ia32_minsh_round_mask:
+ case X86::BI__builtin_ia32_reducepd512_mask:
+ case X86::BI__builtin_ia32_reduceps512_mask:
+ case X86::BI__builtin_ia32_reduceph512_mask:
+ case X86::BI__builtin_ia32_rndscalepd_mask:
+ case X86::BI__builtin_ia32_rndscaleps_mask:
+ case X86::BI__builtin_ia32_rndscaleph_mask:
+ ArgNum = 4;
+ break;
+ case X86::BI__builtin_ia32_fixupimmpd512_mask:
+ case X86::BI__builtin_ia32_fixupimmpd512_maskz:
+ case X86::BI__builtin_ia32_fixupimmps512_mask:
+ case X86::BI__builtin_ia32_fixupimmps512_maskz:
+ case X86::BI__builtin_ia32_fixupimmsd_mask:
+ case X86::BI__builtin_ia32_fixupimmsd_maskz:
+ case X86::BI__builtin_ia32_fixupimmss_mask:
+ case X86::BI__builtin_ia32_fixupimmss_maskz:
+ case X86::BI__builtin_ia32_getmantsd_round_mask:
+ case X86::BI__builtin_ia32_getmantss_round_mask:
+ case X86::BI__builtin_ia32_getmantsh_round_mask:
+ case X86::BI__builtin_ia32_rangepd512_mask:
+ case X86::BI__builtin_ia32_rangeps512_mask:
+ case X86::BI__builtin_ia32_rangesd128_round_mask:
+ case X86::BI__builtin_ia32_rangess128_round_mask:
+ case X86::BI__builtin_ia32_reducesd_mask:
+ case X86::BI__builtin_ia32_reducess_mask:
+ case X86::BI__builtin_ia32_reducesh_mask:
+ case X86::BI__builtin_ia32_rndscalesd_round_mask:
+ case X86::BI__builtin_ia32_rndscaless_round_mask:
+ case X86::BI__builtin_ia32_rndscalesh_round_mask:
+ ArgNum = 5;
+ break;
+ case X86::BI__builtin_ia32_vcvtsd2si64:
+ case X86::BI__builtin_ia32_vcvtsd2si32:
+ case X86::BI__builtin_ia32_vcvtsd2usi32:
+ case X86::BI__builtin_ia32_vcvtsd2usi64:
+ case X86::BI__builtin_ia32_vcvtss2si32:
+ case X86::BI__builtin_ia32_vcvtss2si64:
+ case X86::BI__builtin_ia32_vcvtss2usi32:
+ case X86::BI__builtin_ia32_vcvtss2usi64:
+ case X86::BI__builtin_ia32_vcvtsh2si32:
+ case X86::BI__builtin_ia32_vcvtsh2si64:
+ case X86::BI__builtin_ia32_vcvtsh2usi32:
+ case X86::BI__builtin_ia32_vcvtsh2usi64:
+ case X86::BI__builtin_ia32_sqrtpd512:
+ case X86::BI__builtin_ia32_sqrtps512:
+ case X86::BI__builtin_ia32_sqrtph512:
+ ArgNum = 1;
+ HasRC = true;
+ break;
+ case X86::BI__builtin_ia32_addph512:
+ case X86::BI__builtin_ia32_divph512:
+ case X86::BI__builtin_ia32_mulph512:
+ case X86::BI__builtin_ia32_subph512:
+ case X86::BI__builtin_ia32_addpd512:
+ case X86::BI__builtin_ia32_addps512:
+ case X86::BI__builtin_ia32_divpd512:
+ case X86::BI__builtin_ia32_divps512:
+ case X86::BI__builtin_ia32_mulpd512:
+ case X86::BI__builtin_ia32_mulps512:
+ case X86::BI__builtin_ia32_subpd512:
+ case X86::BI__builtin_ia32_subps512:
+ case X86::BI__builtin_ia32_cvtsi2sd64:
+ case X86::BI__builtin_ia32_cvtsi2ss32:
+ case X86::BI__builtin_ia32_cvtsi2ss64:
+ case X86::BI__builtin_ia32_cvtusi2sd64:
+ case X86::BI__builtin_ia32_cvtusi2ss32:
+ case X86::BI__builtin_ia32_cvtusi2ss64:
+ case X86::BI__builtin_ia32_vcvtusi2sh:
+ case X86::BI__builtin_ia32_vcvtusi642sh:
+ case X86::BI__builtin_ia32_vcvtsi2sh:
+ case X86::BI__builtin_ia32_vcvtsi642sh:
+ ArgNum = 2;
+ HasRC = true;
+ break;
+ case X86::BI__builtin_ia32_cvtdq2ps512_mask:
+ case X86::BI__builtin_ia32_cvtudq2ps512_mask:
+ case X86::BI__builtin_ia32_vcvtpd2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtps2phx512_mask:
+ case X86::BI__builtin_ia32_cvtpd2ps512_mask:
+ case X86::BI__builtin_ia32_cvtpd2dq512_mask:
+ case X86::BI__builtin_ia32_cvtpd2qq512_mask:
+ case X86::BI__builtin_ia32_cvtpd2udq512_mask:
+ case X86::BI__builtin_ia32_cvtpd2uqq512_mask:
+ case X86::BI__builtin_ia32_cvtps2dq512_mask:
+ case X86::BI__builtin_ia32_cvtps2qq512_mask:
+ case X86::BI__builtin_ia32_cvtps2udq512_mask:
+ case X86::BI__builtin_ia32_cvtps2uqq512_mask:
+ case X86::BI__builtin_ia32_cvtqq2pd512_mask:
+ case X86::BI__builtin_ia32_cvtqq2ps512_mask:
+ case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
+ case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
+ case X86::BI__builtin_ia32_vcvtdq2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtudq2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtw2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtuw2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtph2w512_mask:
+ case X86::BI__builtin_ia32_vcvtph2uw512_mask:
+ case X86::BI__builtin_ia32_vcvtph2dq512_mask:
+ case X86::BI__builtin_ia32_vcvtph2udq512_mask:
+ case X86::BI__builtin_ia32_vcvtph2qq512_mask:
+ case X86::BI__builtin_ia32_vcvtph2uqq512_mask:
+ case X86::BI__builtin_ia32_vcvtqq2ph512_mask:
+ case X86::BI__builtin_ia32_vcvtuqq2ph512_mask:
+ ArgNum = 3;
+ HasRC = true;
+ break;
+ case X86::BI__builtin_ia32_addsh_round_mask:
+ case X86::BI__builtin_ia32_addss_round_mask:
+ case X86::BI__builtin_ia32_addsd_round_mask:
+ case X86::BI__builtin_ia32_divsh_round_mask:
+ case X86::BI__builtin_ia32_divss_round_mask:
+ case X86::BI__builtin_ia32_divsd_round_mask:
+ case X86::BI__builtin_ia32_mulsh_round_mask:
+ case X86::BI__builtin_ia32_mulss_round_mask:
+ case X86::BI__builtin_ia32_mulsd_round_mask:
+ case X86::BI__builtin_ia32_subsh_round_mask:
+ case X86::BI__builtin_ia32_subss_round_mask:
+ case X86::BI__builtin_ia32_subsd_round_mask:
+ case X86::BI__builtin_ia32_scalefph512_mask:
+ case X86::BI__builtin_ia32_scalefpd512_mask:
+ case X86::BI__builtin_ia32_scalefps512_mask:
+ case X86::BI__builtin_ia32_scalefsd_round_mask:
+ case X86::BI__builtin_ia32_scalefss_round_mask:
+ case X86::BI__builtin_ia32_scalefsh_round_mask:
+ case X86::BI__builtin_ia32_cvtsd2ss_round_mask:
+ case X86::BI__builtin_ia32_vcvtss2sh_round_mask:
+ case X86::BI__builtin_ia32_vcvtsd2sh_round_mask:
+ case X86::BI__builtin_ia32_sqrtsd_round_mask:
+ case X86::BI__builtin_ia32_sqrtss_round_mask:
+ case X86::BI__builtin_ia32_sqrtsh_round_mask:
+ case X86::BI__builtin_ia32_vfmaddsd3_mask:
+ case X86::BI__builtin_ia32_vfmaddsd3_maskz:
+ case X86::BI__builtin_ia32_vfmaddsd3_mask3:
+ case X86::BI__builtin_ia32_vfmaddss3_mask:
+ case X86::BI__builtin_ia32_vfmaddss3_maskz:
+ case X86::BI__builtin_ia32_vfmaddss3_mask3:
+ case X86::BI__builtin_ia32_vfmaddsh3_mask:
+ case X86::BI__builtin_ia32_vfmaddsh3_maskz:
+ case X86::BI__builtin_ia32_vfmaddsh3_mask3:
+ case X86::BI__builtin_ia32_vfmaddpd512_mask:
+ case X86::BI__builtin_ia32_vfmaddpd512_maskz:
+ case X86::BI__builtin_ia32_vfmaddpd512_mask3:
+ case X86::BI__builtin_ia32_vfmsubpd512_mask3:
+ case X86::BI__builtin_ia32_vfmaddps512_mask:
+ case X86::BI__builtin_ia32_vfmaddps512_maskz:
+ case X86::BI__builtin_ia32_vfmaddps512_mask3:
+ case X86::BI__builtin_ia32_vfmsubps512_mask3:
+ case X86::BI__builtin_ia32_vfmaddph512_mask:
+ case X86::BI__builtin_ia32_vfmaddph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddph512_mask3:
+ case X86::BI__builtin_ia32_vfmsubph512_mask3:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
+ case X86::BI__builtin_ia32_vfmaddsubps512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
+ case X86::BI__builtin_ia32_vfmaddsubph512_mask:
+ case X86::BI__builtin_ia32_vfmaddsubph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddsubph512_mask3:
+ case X86::BI__builtin_ia32_vfmsubaddph512_mask3:
+ case X86::BI__builtin_ia32_vfmaddcsh_mask:
+ case X86::BI__builtin_ia32_vfmaddcsh_round_mask:
+ case X86::BI__builtin_ia32_vfmaddcsh_round_mask3:
+ case X86::BI__builtin_ia32_vfmaddcph512_mask:
+ case X86::BI__builtin_ia32_vfmaddcph512_maskz:
+ case X86::BI__builtin_ia32_vfmaddcph512_mask3:
+ case X86::BI__builtin_ia32_vfcmaddcsh_mask:
+ case X86::BI__builtin_ia32_vfcmaddcsh_round_mask:
+ case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3:
+ case X86::BI__builtin_ia32_vfcmaddcph512_mask:
+ case X86::BI__builtin_ia32_vfcmaddcph512_maskz:
+ case X86::BI__builtin_ia32_vfcmaddcph512_mask3:
+ case X86::BI__builtin_ia32_vfmulcsh_mask:
+ case X86::BI__builtin_ia32_vfmulcph512_mask:
+ case X86::BI__builtin_ia32_vfcmulcsh_mask:
+ case X86::BI__builtin_ia32_vfcmulcph512_mask:
+ ArgNum = 4;
+ HasRC = true;
+ break;
+ }
+
+ llvm::APSInt Result;
+
+ // We can't check the value of a dependent argument.
+ Expr *Arg = TheCall->getArg(ArgNum);
+ if (Arg->isTypeDependent() || Arg->isValueDependent())
+ return false;
+
+ // Check constant-ness first.
+ if (SemaRef.BuiltinConstantArg(TheCall, ArgNum, Result))
+ return true;
+
+ // Make sure the rounding mode is either ROUND_CUR_DIRECTION or has the
+ // ROUND_NO_EXC bit set. If the intrinsic has rounding control (bits 1:0),
+ // make sure it's only combined with ROUND_NO_EXC. If the intrinsic does not
+ // have rounding control, allow ROUND_NO_EXC and ROUND_CUR_DIRECTION together.
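+ // For example, with rounding control an immediate of 8..11 encodes
+ // _MM_FROUND_NO_EXC (8) combined with one of the four rounding modes
+ // (_MM_FROUND_TO_NEAREST_INT = 0 through _MM_FROUND_TO_ZERO = 3).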
+ if (Result == 4 /*ROUND_CUR_DIRECTION*/ || Result == 8 /*ROUND_NO_EXC*/ ||
+ (!HasRC && Result == 12 /*ROUND_CUR_DIRECTION|ROUND_NO_EXC*/) ||
+ (HasRC && Result.getZExtValue() >= 8 && Result.getZExtValue() <= 11))
+ return false;
+
+ return Diag(TheCall->getBeginLoc(), diag::err_x86_builtin_invalid_rounding)
+ << Arg->getSourceRange();
+}
+
+// Check if the gather/scatter scale is legal.
+bool SemaX86::CheckBuiltinGatherScatterScale(unsigned BuiltinID,
+ CallExpr *TheCall) {
+ unsigned ArgNum = 0;
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_gatherd_pd:
+ case X86::BI__builtin_ia32_gatherd_pd256:
+ case X86::BI__builtin_ia32_gatherq_pd:
+ case X86::BI__builtin_ia32_gatherq_pd256:
+ case X86::BI__builtin_ia32_gatherd_ps:
+ case X86::BI__builtin_ia32_gatherd_ps256:
+ case X86::BI__builtin_ia32_gatherq_ps:
+ case X86::BI__builtin_ia32_gatherq_ps256:
+ case X86::BI__builtin_ia32_gatherd_q:
+ case X86::BI__builtin_ia32_gatherd_q256:
+ case X86::BI__builtin_ia32_gatherq_q:
+ case X86::BI__builtin_ia32_gatherq_q256:
+ case X86::BI__builtin_ia32_gatherd_d:
+ case X86::BI__builtin_ia32_gatherd_d256:
+ case X86::BI__builtin_ia32_gatherq_d:
+ case X86::BI__builtin_ia32_gatherq_d256:
+ case X86::BI__builtin_ia32_gather3div2df:
+ case X86::BI__builtin_ia32_gather3div2di:
+ case X86::BI__builtin_ia32_gather3div4df:
+ case X86::BI__builtin_ia32_gather3div4di:
+ case X86::BI__builtin_ia32_gather3div4sf:
+ case X86::BI__builtin_ia32_gather3div4si:
+ case X86::BI__builtin_ia32_gather3div8sf:
+ case X86::BI__builtin_ia32_gather3div8si:
+ case X86::BI__builtin_ia32_gather3siv2df:
+ case X86::BI__builtin_ia32_gather3siv2di:
+ case X86::BI__builtin_ia32_gather3siv4df:
+ case X86::BI__builtin_ia32_gather3siv4di:
+ case X86::BI__builtin_ia32_gather3siv4sf:
+ case X86::BI__builtin_ia32_gather3siv4si:
+ case X86::BI__builtin_ia32_gather3siv8sf:
+ case X86::BI__builtin_ia32_gather3siv8si:
+ case X86::BI__builtin_ia32_gathersiv8df:
+ case X86::BI__builtin_ia32_gathersiv16sf:
+ case X86::BI__builtin_ia32_gatherdiv8df:
+ case X86::BI__builtin_ia32_gatherdiv16sf:
+ case X86::BI__builtin_ia32_gathersiv8di:
+ case X86::BI__builtin_ia32_gathersiv16si:
+ case X86::BI__builtin_ia32_gatherdiv8di:
+ case X86::BI__builtin_ia32_gatherdiv16si:
+ case X86::BI__builtin_ia32_scatterdiv2df:
+ case X86::BI__builtin_ia32_scatterdiv2di:
+ case X86::BI__builtin_ia32_scatterdiv4df:
+ case X86::BI__builtin_ia32_scatterdiv4di:
+ case X86::BI__builtin_ia32_scatterdiv4sf:
+ case X86::BI__builtin_ia32_scatterdiv4si:
+ case X86::BI__builtin_ia32_scatterdiv8sf:
+ case X86::BI__builtin_ia32_scatterdiv8si:
+ case X86::BI__builtin_ia32_scattersiv2df:
+ case X86::BI__builtin_ia32_scattersiv2di:
+ case X86::BI__builtin_ia32_scattersiv4df:
+ case X86::BI__builtin_ia32_scattersiv4di:
+ case X86::BI__builtin_ia32_scattersiv4sf:
+ case X86::BI__builtin_ia32_scattersiv4si:
+ case X86::BI__builtin_ia32_scattersiv8sf:
+ case X86::BI__builtin_ia32_scattersiv8si:
+ case X86::BI__builtin_ia32_scattersiv8df:
+ case X86::BI__builtin_ia32_scattersiv16sf:
+ case X86::BI__builtin_ia32_scatterdiv8df:
+ case X86::BI__builtin_ia32_scatterdiv16sf:
+ case X86::BI__builtin_ia32_scattersiv8di:
+ case X86::BI__builtin_ia32_scattersiv16si:
+ case X86::BI__builtin_ia32_scatterdiv8di:
+ case X86::BI__builtin_ia32_scatterdiv16si:
+ ArgNum = 4;
+ break;
+ }
+
+ llvm::APSInt Result;
+
+ // We can't check the value of a dependent argument.
+ Expr *Arg = TheCall->getArg(ArgNum);
+ if (Arg->isTypeDependent() || Arg->isValueDependent())
+ return false;
+
+ // Check constant-ness first.
+ if (SemaRef.BuiltinConstantArg(TheCall, ArgNum, Result))
+ return true;
+
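+ // VSIB addressing only encodes scale factors of 1, 2, 4, or 8 bytes.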
+ if (Result == 1 || Result == 2 || Result == 4 || Result == 8)
+ return false;
+
+ return Diag(TheCall->getBeginLoc(), diag::err_x86_builtin_invalid_scale)
+ << Arg->getSourceRange();
+}
+
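+// AMX tile operands select one of the eight tile registers tmm0..tmm7, so a
+// valid tile register number lies in [TileRegLow, TileRegHigh].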
+enum { TileRegLow = 0, TileRegHigh = 7 };
+
+bool SemaX86::CheckBuiltinTileArgumentsRange(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+ for (int ArgNum : ArgNums) {
+ if (SemaRef.BuiltinConstantArgRange(TheCall, ArgNum, TileRegLow,
+ TileRegHigh))
+ return true;
+ }
+ return false;
+}
+
+bool SemaX86::CheckBuiltinTileDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+ // The maximum number of tile registers is TileRegHigh + 1, so we use one bit
+ // per register in the bitset to track which registers have already been used.
+ std::bitset<TileRegHigh + 1> ArgValues;
+ for (int ArgNum : ArgNums) {
+ Expr *Arg = TheCall->getArg(ArgNum);
+ if (Arg->isTypeDependent() || Arg->isValueDependent())
+ continue;
+
+ llvm::APSInt Result;
+ if (SemaRef.BuiltinConstantArg(TheCall, ArgNum, Result))
+ return true;
+ int ArgExtValue = Result.getExtValue();
+ assert((ArgExtValue >= TileRegLow && ArgExtValue <= TileRegHigh) &&
+ "Incorrect tile register num.");
+ if (ArgValues.test(ArgExtValue))
+ return Diag(TheCall->getBeginLoc(),
+ diag::err_x86_builtin_tile_arg_duplicate)
+ << TheCall->getArg(ArgNum)->getSourceRange();
+ ArgValues.set(ArgExtValue);
+ }
+ return false;
+}
+
+bool SemaX86::CheckBuiltinTileRangeAndDuplicate(CallExpr *TheCall,
+ ArrayRef<int> ArgNums) {
+ return CheckBuiltinTileArgumentsRange(TheCall, ArgNums) ||
+ CheckBuiltinTileDuplicate(TheCall, ArgNums);
+}
+
+bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_tileloadd64:
+ case X86::BI__builtin_ia32_tileloaddt164:
+ case X86::BI__builtin_ia32_tilestored64:
+ case X86::BI__builtin_ia32_tilezero:
+ return CheckBuiltinTileArgumentsRange(TheCall, 0);
+ case X86::BI__builtin_ia32_tdpbssd:
+ case X86::BI__builtin_ia32_tdpbsud:
+ case X86::BI__builtin_ia32_tdpbusd:
+ case X86::BI__builtin_ia32_tdpbuud:
+ case X86::BI__builtin_ia32_tdpbf16ps:
+ case X86::BI__builtin_ia32_tdpfp16ps:
+ case X86::BI__builtin_ia32_tcmmimfp16ps:
+ case X86::BI__builtin_ia32_tcmmrlfp16ps:
+ return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
+ }
+}
+
+static bool isX86_32Builtin(unsigned BuiltinID) {
+ // These builtins only work on x86-32 targets.
+ switch (BuiltinID) {
+ case X86::BI__builtin_ia32_readeflags_u32:
+ case X86::BI__builtin_ia32_writeeflags_u32:
+ return true;
+ }
+
+ return false;
+}
+
+bool SemaX86::CheckBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID,
+ CallExpr *TheCall) {
+ // Check for 32-bit only builtins on a 64-bit target.
+ const llvm::Triple &TT = TI.getTriple();
+ if (TT.getArch() != llvm::Triple::x86 && isX86_32Builtin(BuiltinID))
+ return Diag(TheCall->getCallee()->getBeginLoc(),
+ diag::err_32_bit_builtin_64_bit_tgt);
+
+ // If the intrinsic has rounding or SAE, make sure it's valid.
+ if (CheckBuiltinRoundingOrSAE(BuiltinID, TheCall))
+ return true;
+
+ // If the intrinsic has a gather/scatter scale immediate, make sure it's valid.
+ if (CheckBuiltinGatherScatterScale(BuiltinID, TheCall))
+ return true;
+
+ // If the intrinsic has tile arguments, make sure they are valid.
+ if (CheckBuiltinTileArguments(BuiltinID, TheCall))
+ return true;
+
+ // For intrinsics which take an immediate value as part of the instruction,
+ // range check them here.
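+ // Here i is the index of the immediate argument and [l, u] is its allowed
+ // inclusive range.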
+ int i = 0, l = 0, u = 0;
+ switch (BuiltinID) {
+ default:
+ return false;
+ case X86::BI__builtin_ia32_vec_ext_v2si:
+ case X86::BI__builtin_ia32_vec_ext_v2di:
+ case X86::BI__builtin_ia32_vextractf128_pd256:
+ case X86::BI__builtin_ia32_vextractf128_ps256:
+ case X86::BI__builtin_ia32_vextractf128_si256:
+ case X86::BI__builtin_ia32_extract128i256:
+ case X86::BI__builtin_ia32_extractf64x4_mask:
+ case X86::BI__builtin_ia32_extracti64x4_mask:
+ case X86::BI__builtin_ia32_extractf32x8_mask:
+ case X86::BI__builtin_ia32_extracti32x8_mask:
+ case X86::BI__builtin_ia32_extractf64x2_256_mask:
+ case X86::BI__builtin_ia32_extracti64x2_256_mask:
+ case X86::BI__builtin_ia32_extractf32x4_256_mask:
+ case X86::BI__builtin_ia32_extracti32x4_256_mask:
+ i = 1;
+ l = 0;
+ u = 1;
+ break;
+ case X86::BI__builtin_ia32_vec_set_v2di:
+ case X86::BI__builtin_ia32_vinsertf128_pd256:
+ case X86::BI__builtin_ia32_vinsertf128_ps256:
+ case X86::BI__builtin_ia32_vinsertf128_si256:
+ case X86::BI__builtin_ia32_insert128i256:
+ case X86::BI__builtin_ia32_insertf32x8:
+ case X86::BI__builtin_ia32_inserti32x8:
+ case X86::BI__builtin_ia32_insertf64x4:
+ case X86::BI__builtin_ia32_inserti64x4:
+ case X86::BI__builtin_ia32_insertf64x2_256:
+ case X86::BI__builtin_ia32_inserti64x2_256:
+ case X86::BI__builtin_ia32_insertf32x4_256:
+ case X86::BI__builtin_ia32_inserti32x4_256:
+ i = 2;
+ l = 0;
+ u = 1;
+ break;
+ case X86::BI__builtin_ia32_vpermilpd:
+ case X86::BI__builtin_ia32_vec_ext_v4hi:
+ case X86::BI__builtin_ia32_vec_ext_v4si:
+ case X86::BI__builtin_ia32_vec_ext_v4sf:
+ case X86::BI__builtin_ia32_vec_ext_v4di:
+ case X86::BI__builtin_ia32_extractf32x4_mask:
+ case X86::BI__builtin_ia32_extracti32x4_mask:
+ case X86::BI__builtin_ia32_extractf64x2_512_mask:
+ case X86::BI__builtin_ia32_extracti64x2_512_mask:
+ i = 1;
+ l = 0;
+ u = 3;
+ break;
+ case X86::BI_mm_prefetch:
+ case X86::BI__builtin_ia32_vec_ext_v8hi:
+ case X86::BI__builtin_ia32_vec_ext_v8si:
+ i = 1;
+ l = 0;
+ u = 7;
+ break;
+ case X86::BI__builtin_ia32_sha1rnds4:
+ case X86::BI__builtin_ia32_blendpd:
+ case X86::BI__builtin_ia32_shufpd:
+ case X86::BI__builtin_ia32_vec_set_v4hi:
+ case X86::BI__builtin_ia32_vec_set_v4si:
+ case X86::BI__builtin_ia32_vec_set_v4di:
+ case X86::BI__builtin_ia32_shuf_f32x4_256:
+ case X86::BI__builtin_ia32_shuf_f64x2_256:
+ case X86::BI__builtin_ia32_shuf_i32x4_256:
+ case X86::BI__builtin_ia32_shuf_i64x2_256:
+ case X86::BI__builtin_ia32_insertf64x2_512:
+ case X86::BI__builtin_ia32_inserti64x2_512:
+ case X86::BI__builtin_ia32_insertf32x4:
+ case X86::BI__builtin_ia32_inserti32x4:
+ i = 2;
+ l = 0;
+ u = 3;
+ break;
+ case X86::BI__builtin_ia32_vpermil2pd:
+ case X86::BI__builtin_ia32_vpermil2pd256:
+ case X86::BI__builtin_ia32_vpermil2ps:
+ case X86::BI__builtin_ia32_vpermil2ps256:
+ i = 3;
+ l = 0;
+ u = 3;
+ break;
+ case X86::BI__builtin_ia32_cmpb128_mask:
+ case X86::BI__builtin_ia32_cmpw128_mask:
+ case X86::BI__builtin_ia32_cmpd128_mask:
+ case X86::BI__builtin_ia32_cmpq128_mask:
+ case X86::BI__builtin_ia32_cmpb256_mask:
+ case X86::BI__builtin_ia32_cmpw256_mask:
+ case X86::BI__builtin_ia32_cmpd256_mask:
+ case X86::BI__builtin_ia32_cmpq256_mask:
+ case X86::BI__builtin_ia32_cmpb512_mask:
+ case X86::BI__builtin_ia32_cmpw512_mask:
+ case X86::BI__builtin_ia32_cmpd512_mask:
+ case X86::BI__builtin_ia32_cmpq512_mask:
+ case X86::BI__builtin_ia32_ucmpb128_mask:
+ case X86::BI__builtin_ia32_ucmpw128_mask:
+ case X86::BI__builtin_ia32_ucmpd128_mask:
+ case X86::BI__builtin_ia32_ucmpq128_mask:
+ case X86::BI__builtin_ia32_ucmpb256_mask:
+ case X86::BI__builtin_ia32_ucmpw256_mask:
+ case X86::BI__builtin_ia32_ucmpd256_mask:
+ case X86::BI__builtin_ia32_ucmpq256_mask:
+ case X86::BI__builtin_ia32_ucmpb512_mask:
+ case X86::BI__builtin_ia32_ucmpw512_mask:
+ case X86::BI__builtin_ia32_ucmpd512_mask:
+ case X86::BI__builtin_ia32_ucmpq512_mask:
+ case X86::BI__builtin_ia32_vpcomub:
+ case X86::BI__builtin_ia32_vpcomuw:
+ case X86::BI__builtin_ia32_vpcomud:
+ case X86::BI__builtin_ia32_vpcomuq:
+ case X86::BI__builtin_ia32_vpcomb:
+ case X86::BI__builtin_ia32_vpcomw:
+ case X86::BI__builtin_ia32_vpcomd:
+ case X86::BI__builtin_ia32_vpcomq:
+ case X86::BI__builtin_ia32_vec_set_v8hi:
+ case X86::BI__builtin_ia32_vec_set_v8si:
+ i = 2;
+ l = 0;
+ u = 7;
+ break;
+ case X86::BI__builtin_ia32_vpermilpd256:
+ case X86::BI__builtin_ia32_roundps:
+ case X86::BI__builtin_ia32_roundpd:
+ case X86::BI__builtin_ia32_roundps256:
+ case X86::BI__builtin_ia32_roundpd256:
+ case X86::BI__builtin_ia32_getmantpd128_mask:
+ case X86::BI__builtin_ia32_getmantpd256_mask:
+ case X86::BI__builtin_ia32_getmantps128_mask:
+ case X86::BI__builtin_ia32_getmantps256_mask:
+ case X86::BI__builtin_ia32_getmantpd512_mask:
+ case X86::BI__builtin_ia32_getmantps512_mask:
+ case X86::BI__builtin_ia32_getmantph128_mask:
+ case X86::BI__builtin_ia32_getmantph256_mask:
+ case X86::BI__builtin_ia32_getmantph512_mask:
+ case X86::BI__builtin_ia32_vec_ext_v16qi:
+ case X86::BI__builtin_ia32_vec_ext_v16hi:
+ i = 1;
+ l = 0;
+ u = 15;
+ break;
+ case X86::BI__builtin_ia32_pblendd128:
+ case X86::BI__builtin_ia32_blendps:
+ case X86::BI__builtin_ia32_blendpd256:
+ case X86::BI__builtin_ia32_shufpd256:
+ case X86::BI__builtin_ia32_roundss:
+ case X86::BI__builtin_ia32_roundsd:
+ case X86::BI__builtin_ia32_rangepd128_mask:
+ case X86::BI__builtin_ia32_rangepd256_mask:
+ case X86::BI__builtin_ia32_rangepd512_mask:
+ case X86::BI__builtin_ia32_rangeps128_mask:
+ case X86::BI__builtin_ia32_rangeps256_mask:
+ case X86::BI__builtin_ia32_rangeps512_mask:
+ case X86::BI__builtin_ia32_getmantsd_round_mask:
+ case X86::BI__builtin_ia32_getmantss_round_mask:
+ case X86::BI__builtin_ia32_getmantsh_round_mask:
+ case X86::BI__builtin_ia32_vec_set_v16qi:
+ case X86::BI__builtin_ia32_vec_set_v16hi:
+ i = 2;
+ l = 0;
+ u = 15;
+ break;
+ case X86::BI__builtin_ia32_vec_ext_v32qi:
+ i = 1;
+ l = 0;
+ u = 31;
+ break;
+ case X86::BI__builtin_ia32_cmpps:
+ case X86::BI__builtin_ia32_cmpss:
+ case X86::BI__builtin_ia32_cmppd:
+ case X86::BI__builtin_ia32_cmpsd:
+ case X86::BI__builtin_ia32_cmpps256:
+ case X86::BI__builtin_ia32_cmppd256:
+ case X86::BI__builtin_ia32_cmpps128_mask:
+ case X86::BI__builtin_ia32_cmppd128_mask:
+ case X86::BI__builtin_ia32_cmpps256_mask:
+ case X86::BI__builtin_ia32_cmppd256_mask:
+ case X86::BI__builtin_ia32_cmpps512_mask:
+ case X86::BI__builtin_ia32_cmppd512_mask:
+ case X86::BI__builtin_ia32_cmpsd_mask:
+ case X86::BI__builtin_ia32_cmpss_mask:
+ case X86::BI__builtin_ia32_vec_set_v32qi:
+ i = 2;
+ l = 0;
+ u = 31;
+ break;
+ case X86::BI__builtin_ia32_permdf256:
+ case X86::BI__builtin_ia32_permdi256:
+ case X86::BI__builtin_ia32_permdf512:
+ case X86::BI__builtin_ia32_permdi512:
+ case X86::BI__builtin_ia32_vpermilps:
+ case X86::BI__builtin_ia32_vpermilps256:
+ case X86::BI__builtin_ia32_vpermilpd512:
+ case X86::BI__builtin_ia32_vpermilps512:
+ case X86::BI__builtin_ia32_pshufd:
+ case X86::BI__builtin_ia32_pshufd256:
+ case X86::BI__builtin_ia32_pshufd512:
+ case X86::BI__builtin_ia32_pshufhw:
+ case X86::BI__builtin_ia32_pshufhw256:
+ case X86::BI__builtin_ia32_pshufhw512:
+ case X86::BI__builtin_ia32_pshuflw:
+ case X86::BI__builtin_ia32_pshuflw256:
+ case X86::BI__builtin_ia32_pshuflw512:
+ case X86::BI__builtin_ia32_vcvtps2ph:
+ case X86::BI__builtin_ia32_vcvtps2ph_mask:
+ case X86::BI__builtin_ia32_vcvtps2ph256:
+ case X86::BI__builtin_ia32_vcvtps2ph256_mask:
+ case X86::BI__builtin_ia32_vcvtps2ph512_mask:
+ case X86::BI__builtin_ia32_rndscaleps_128_mask:
+ case X86::BI__builtin_ia32_rndscalepd_128_mask:
+ case X86::BI__builtin_ia32_rndscaleps_256_mask:
+ case X86::BI__builtin_ia32_rndscalepd_256_mask:
+ case X86::BI__builtin_ia32_rndscaleps_mask:
+ case X86::BI__builtin_ia32_rndscalepd_mask:
+ case X86::BI__builtin_ia32_rndscaleph_mask:
+ case X86::BI__builtin_ia32_reducepd128_mask:
+ case X86::BI__builtin_ia32_reducepd256_mask:
+ case X86::BI__builtin_ia32_reducepd512_mask:
+ case X86::BI__builtin_ia32_reduceps128_mask:
+ case X86::BI__builtin_ia32_reduceps256_mask:
+ case X86::BI__builtin_ia32_reduceps512_mask:
+ case X86::BI__builtin_ia32_reduceph128_mask:
+ case X86::BI__builtin_ia32_reduceph256_mask:
+ case X86::BI__builtin_ia32_reduceph512_mask:
+ case X86::BI__builtin_ia32_prold512:
+ case X86::BI__builtin_ia32_prolq512:
+ case X86::BI__builtin_ia32_prold128:
+ case X86::BI__builtin_ia32_prold256:
+ case X86::BI__builtin_ia32_prolq128:
+ case X86::BI__builtin_ia32_prolq256:
+ case X86::BI__builtin_ia32_prord512:
+ case X86::BI__builtin_ia32_prorq512:
+ case X86::BI__builtin_ia32_prord128:
+ case X86::BI__builtin_ia32_prord256:
+ case X86::BI__builtin_ia32_prorq128:
+ case X86::BI__builtin_ia32_prorq256:
+ case X86::BI__builtin_ia32_fpclasspd128_mask:
+ case X86::BI__builtin_ia32_fpclasspd256_mask:
+ case X86::BI__builtin_ia32_fpclassps128_mask:
+ case X86::BI__builtin_ia32_fpclassps256_mask:
+ case X86::BI__builtin_ia32_fpclassps512_mask:
+ case X86::BI__builtin_ia32_fpclasspd512_mask:
+ case X86::BI__builtin_ia32_fpclassph128_mask:
+ case X86::BI__builtin_ia32_fpclassph256_mask:
+ case X86::BI__builtin_ia32_fpclassph512_mask:
+ case X86::BI__builtin_ia32_fpclasssd_mask:
+ case X86::BI__builtin_ia32_fpclassss_mask:
+ case X86::BI__builtin_ia32_fpclasssh_mask:
+ case X86::BI__builtin_ia32_pslldqi128_byteshift:
+ case X86::BI__builtin_ia32_pslldqi256_byteshift:
+ case X86::BI__builtin_ia32_pslldqi512_byteshift:
+ case X86::BI__builtin_ia32_psrldqi128_byteshift:
+ case X86::BI__builtin_ia32_psrldqi256_byteshift:
+ case X86::BI__builtin_ia32_psrldqi512_byteshift:
+ case X86::BI__builtin_ia32_kshiftliqi:
+ case X86::BI__builtin_ia32_kshiftlihi:
+ case X86::BI__builtin_ia32_kshiftlisi:
+ case X86::BI__builtin_ia32_kshiftlidi:
+ case X86::BI__builtin_ia32_kshiftriqi:
+ case X86::BI__builtin_ia32_kshiftrihi:
+ case X86::BI__builtin_ia32_kshiftrisi:
+ case X86::BI__builtin_ia32_kshiftridi:
+ i = 1;
+ l = 0;
+ u = 255;
+ break;
+ case X86::BI__builtin_ia32_vperm2f128_pd256:
+ case X86::BI__builtin_ia32_vperm2f128_ps256:
+ case X86::BI__builtin_ia32_vperm2f128_si256:
+ case X86::BI__builtin_ia32_permti256:
+ case X86::BI__builtin_ia32_pblendw128:
+ case X86::BI__builtin_ia32_pblendw256:
+ case X86::BI__builtin_ia32_blendps256:
+ case X86::BI__builtin_ia32_pblendd256:
+ case X86::BI__builtin_ia32_palignr128:
+ case X86::BI__builtin_ia32_palignr256:
+ case X86::BI__builtin_ia32_palignr512:
+ case X86::BI__builtin_ia32_alignq512:
+ case X86::BI__builtin_ia32_alignd512:
+ case X86::BI__builtin_ia32_alignd128:
+ case X86::BI__builtin_ia32_alignd256:
+ case X86::BI__builtin_ia32_alignq128:
+ case X86::BI__builtin_ia32_alignq256:
+ case X86::BI__builtin_ia32_vcomisd:
+ case X86::BI__builtin_ia32_vcomiss:
+ case X86::BI__builtin_ia32_shuf_f32x4:
+ case X86::BI__builtin_ia32_shuf_f64x2:
+ case X86::BI__builtin_ia32_shuf_i32x4:
+ case X86::BI__builtin_ia32_shuf_i64x2:
+ case X86::BI__builtin_ia32_shufpd512:
+ case X86::BI__builtin_ia32_shufps:
+ case X86::BI__builtin_ia32_shufps256:
+ case X86::BI__builtin_ia32_shufps512:
+ case X86::BI__builtin_ia32_dbpsadbw128:
+ case X86::BI__builtin_ia32_dbpsadbw256:
+ case X86::BI__builtin_ia32_dbpsadbw512:
+ case X86::BI__builtin_ia32_vpshldd128:
+ case X86::BI__builtin_ia32_vpshldd256:
+ case X86::BI__builtin_ia32_vpshldd512:
+ case X86::BI__builtin_ia32_vpshldq128:
+ case X86::BI__builtin_ia32_vpshldq256:
+ case X86::BI__builtin_ia32_vpshldq512:
+ case X86::BI__builtin_ia32_vpshldw128:
+ case X86::BI__builtin_ia32_vpshldw256:
+ case X86::BI__builtin_ia32_vpshldw512:
+ case X86::BI__builtin_ia32_vpshrdd128:
+ case X86::BI__builtin_ia32_vpshrdd256:
+ case X86::BI__builtin_ia32_vpshrdd512:
+ case X86::BI__builtin_ia32_vpshrdq128:
+ case X86::BI__builtin_ia32_vpshrdq256:
+ case X86::BI__builtin_ia32_vpshrdq512:
+ case X86::BI__builtin_ia32_vpshrdw128:
+ case X86::BI__builtin_ia32_vpshrdw256:
+ case X86::BI__builtin_ia32_vpshrdw512:
+ i = 2;
+ l = 0;
+ u = 255;
+ break;
+ case X86::BI__builtin_ia32_fixupimmpd512_mask:
+ case X86::BI__builtin_ia32_fixupimmpd512_maskz:
+ case X86::BI__builtin_ia32_fixupimmps512_mask:
+ case X86::BI__builtin_ia32_fixupimmps512_maskz:
+ case X86::BI__builtin_ia32_fixupimmsd_mask:
+ case X86::BI__builtin_ia32_fixupimmsd_maskz:
+ case X86::BI__builtin_ia32_fixupimmss_mask:
+ case X86::BI__builtin_ia32_fixupimmss_maskz:
+ case X86::BI__builtin_ia32_fixupimmpd128_mask:
+ case X86::BI__builtin_ia32_fixupimmpd128_maskz:
+ case X86::BI__builtin_ia32_fixupimmpd256_mask:
+ case X86::BI__builtin_ia32_fixupimmpd256_maskz:
+ case X86::BI__builtin_ia32_fixupimmps128_mask:
+ case X86::BI__builtin_ia32_fixupimmps128_maskz:
+ case X86::BI__builtin_ia32_fixupimmps256_mask:
+ case X86::BI__builtin_ia32_fixupimmps256_maskz:
+ case X86::BI__builtin_ia32_pternlogd512_mask:
+ case X86::BI__builtin_ia32_pternlogd512_maskz:
+ case X86::BI__builtin_ia32_pternlogq512_mask:
+ case X86::BI__builtin_ia32_pternlogq512_maskz:
+ case X86::BI__builtin_ia32_pternlogd128_mask:
+ case X86::BI__builtin_ia32_pternlogd128_maskz:
+ case X86::BI__builtin_ia32_pternlogd256_mask:
+ case X86::BI__builtin_ia32_pternlogd256_maskz:
+ case X86::BI__builtin_ia32_pternlogq128_mask:
+ case X86::BI__builtin_ia32_pternlogq128_maskz:
+ case X86::BI__builtin_ia32_pternlogq256_mask:
+ case X86::BI__builtin_ia32_pternlogq256_maskz:
+ case X86::BI__builtin_ia32_vsm3rnds2:
+ i = 3;
+ l = 0;
+ u = 255;
+ break;
+ case X86::BI__builtin_ia32_reducesd_mask:
+ case X86::BI__builtin_ia32_reducess_mask:
+ case X86::BI__builtin_ia32_rndscalesd_round_mask:
+ case X86::BI__builtin_ia32_rndscaless_round_mask:
+ case X86::BI__builtin_ia32_rndscalesh_round_mask:
+ case X86::BI__builtin_ia32_reducesh_mask:
+ i = 4;
+ l = 0;
+ u = 255;
+ break;
+ case X86::BI__builtin_ia32_cmpccxadd32:
+ case X86::BI__builtin_ia32_cmpccxadd64:
+ i = 3;
+ l = 0;
+ u = 15;
+ break;
+ }
+
+ // Note that we don't force a hard error on the range check here, allowing
+ // template-generated or macro-generated dead code to potentially have
+ // out-of-range values. These still need to code-generate, but they don't
+ // necessarily need to make sense. We use a warning that defaults to an error.
+ return SemaRef.BuiltinConstantArgRange(TheCall, i, l, u,
+ /*RangeIsError*/ false);
+}
+
+} // namespace clang
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 6b53c2490cc4..dee335b52699 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -7337,7 +7337,7 @@ QualType TreeTransform<Derived>::TransformCountAttributedType(
if (getDerived().AlwaysRebuild() || InnerTy != OldTy->desugar() ||
OldCount != NewCount) {
// Currently, CountAttributedType can only wrap incomplete array types.
- Result = SemaRef.BuildCountAttributedArrayType(InnerTy, NewCount);
+ Result = SemaRef.BuildCountAttributedArrayOrPointerType(InnerTy, NewCount);
}
TLB.push<CountAttributedTypeLoc>(Result);
@@ -14114,6 +14114,13 @@ TreeTransform<Derived>::TransformCXXTemporaryObjectExpr(
if (TransformExprs(E->getArgs(), E->getNumArgs(), true, Args,
&ArgumentChanged))
return ExprError();
+
+ if (E->isListInitialization() && !E->isStdInitListInitialization()) {
+ ExprResult Res = RebuildInitList(E->getBeginLoc(), Args, E->getEndLoc());
+ if (Res.isInvalid())
+ return ExprError();
+ Args = {Res.get()};
+ }
}
if (!getDerived().AlwaysRebuild() &&
@@ -14125,12 +14132,9 @@ TreeTransform<Derived>::TransformCXXTemporaryObjectExpr(
return SemaRef.MaybeBindToTemporary(E);
}
- // FIXME: We should just pass E->isListInitialization(), but we're not
- // prepared to handle list-initialization without a child InitListExpr.
SourceLocation LParenLoc = T->getTypeLoc().getEndLoc();
return getDerived().RebuildCXXTemporaryObjectExpr(
- T, LParenLoc, Args, E->getEndLoc(),
- /*ListInitialization=*/LParenLoc.isInvalid());
+ T, LParenLoc, Args, E->getEndLoc(), E->isListInitialization());
}
template<typename Derived>
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index a6254b70560c..61cc99d4df68 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2695,7 +2695,8 @@ void ASTDeclReader::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
}
if (Record.readInt())
- D->setDefaultArgument(readTypeSourceInfo());
+ D->setDefaultArgument(Reader.getContext(),
+ Record.readTemplateArgumentLoc());
}
void ASTDeclReader::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
@@ -2716,7 +2717,8 @@ void ASTDeclReader::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
// Rest of NonTypeTemplateParmDecl.
D->ParameterPack = Record.readInt();
if (Record.readInt())
- D->setDefaultArgument(Record.readExpr());
+ D->setDefaultArgument(Reader.getContext(),
+ Record.readTemplateArgumentLoc());
}
}
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index c2f1d1b44241..bbd16dbdb8ff 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -1899,7 +1899,7 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
!D->defaultArgumentWasInherited();
Record.push_back(OwnsDefaultArg);
if (OwnsDefaultArg)
- Record.AddTypeSourceInfo(D->getDefaultArgumentInfo());
+ Record.AddTemplateArgumentLoc(D->getDefaultArgument());
if (!TC && !OwnsDefaultArg &&
D->getDeclContext() == D->getLexicalDeclContext() &&
@@ -1941,7 +1941,7 @@ void ASTDeclWriter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
!D->defaultArgumentWasInherited();
Record.push_back(OwnsDefaultArg);
if (OwnsDefaultArg)
- Record.AddStmt(D->getDefaultArgument());
+ Record.AddTemplateArgumentLoc(D->getDefaultArgument());
Code = serialization::DECL_NON_TYPE_TEMPLATE_PARM;
}
}
diff --git a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
index 4443ffd09293..cd5a3bdd02e4 100644
--- a/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ b/clang/lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -96,13 +96,14 @@ add_clang_library(clangStaticAnalyzerCheckers
PointerSortingChecker.cpp
PointerSubChecker.cpp
PthreadLockChecker.cpp
- cert/PutenvWithAutoChecker.cpp
+ PutenvStackArrayChecker.cpp
RetainCountChecker/RetainCountChecker.cpp
RetainCountChecker/RetainCountDiagnostics.cpp
ReturnPointerRangeChecker.cpp
ReturnUndefChecker.cpp
ReturnValueChecker.cpp
RunLoopAutoreleaseLeakChecker.cpp
+ SetgidSetuidOrderChecker.cpp
SimpleStreamChecker.cpp
SmartPtrChecker.cpp
SmartPtrModeling.cpp
diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp
index a82f7caf16b2..d59cebf0aa5c 100644
--- a/clang/lib/StaticAnalyzer/Checkers/cert/PutenvWithAutoChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/PutenvStackArrayChecker.cpp
@@ -1,4 +1,4 @@
-//== PutenvWithAutoChecker.cpp --------------------------------- -*- C++ -*--=//
+//== PutenvStackArrayChecker.cpp ------------------------------- -*- C++ -*--=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,13 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines PutenvWithAutoChecker which finds calls of ``putenv``
-// function with automatic variable as the argument.
+// This file defines PutenvStackArrayChecker, which finds calls to the
+// ``putenv`` function with an automatic array variable as the argument.
// https://wiki.sei.cmu.edu/confluence/x/6NYxBQ
//
//===----------------------------------------------------------------------===//
-#include "../AllocationState.h"
+#include "AllocationState.h"
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
#include "clang/StaticAnalyzer/Core/Checker.h"
@@ -26,9 +26,9 @@ using namespace clang;
using namespace ento;
namespace {
-class PutenvWithAutoChecker : public Checker<check::PostCall> {
+class PutenvStackArrayChecker : public Checker<check::PostCall> {
private:
- BugType BT{this, "'putenv' function should not be called with auto variables",
+ BugType BT{this, "'putenv' called with stack-allocated string",
categories::SecurityError};
const CallDescription Putenv{CDM::CLibrary, {"putenv"}, 1};
@@ -37,8 +37,8 @@ public:
};
} // namespace
-void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call,
- CheckerContext &C) const {
+void PutenvStackArrayChecker::checkPostCall(const CallEvent &Call,
+ CheckerContext &C) const {
if (!Putenv.matches(Call))
return;
@@ -50,7 +50,7 @@ void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call,
return;
StringRef ErrorMsg = "The 'putenv' function should not be called with "
- "arguments that have automatic storage";
+ "arrays that have automatic storage";
ExplodedNode *N = C.generateErrorNode();
auto Report = std::make_unique<PathSensitiveBugReport>(BT, ErrorMsg, N);
@@ -60,8 +60,10 @@ void PutenvWithAutoChecker::checkPostCall(const CallEvent &Call,
C.emitReport(std::move(Report));
}
-void ento::registerPutenvWithAuto(CheckerManager &Mgr) {
- Mgr.registerChecker<PutenvWithAutoChecker>();
+void ento::registerPutenvStackArray(CheckerManager &Mgr) {
+ Mgr.registerChecker<PutenvStackArrayChecker>();
}
-bool ento::shouldRegisterPutenvWithAuto(const CheckerManager &) { return true; }
+bool ento::shouldRegisterPutenvStackArray(const CheckerManager &) {
+ return true;
+}
diff --git a/clang/lib/StaticAnalyzer/Checkers/SetgidSetuidOrderChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/SetgidSetuidOrderChecker.cpp
new file mode 100644
index 000000000000..dbe3fd33a6b4
--- /dev/null
+++ b/clang/lib/StaticAnalyzer/Checkers/SetgidSetuidOrderChecker.cpp
@@ -0,0 +1,196 @@
+//===-- SetgidSetuidOrderChecker.cpp - check privilege revocation calls ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a checker to detect possible reversed order of privilege
+// revocations when 'setgid' and 'setuid' are used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+
+enum SetPrivilegeFunctionKind { Irrelevant, Setuid, Setgid };
+
+class SetgidSetuidOrderChecker : public Checker<check::PostCall, eval::Assume> {
+ const BugType BT{this, "Possible wrong order of privilege revocation"};
+
+ const CallDescription SetuidDesc{CDM::CLibrary, {"setuid"}, 1};
+ const CallDescription SetgidDesc{CDM::CLibrary, {"setgid"}, 1};
+
+ const CallDescription GetuidDesc{CDM::CLibrary, {"getuid"}, 0};
+ const CallDescription GetgidDesc{CDM::CLibrary, {"getgid"}, 0};
+
+ const CallDescriptionSet OtherSetPrivilegeDesc{
+ {CDM::CLibrary, {"seteuid"}, 1}, {CDM::CLibrary, {"setegid"}, 1},
+ {CDM::CLibrary, {"setreuid"}, 2}, {CDM::CLibrary, {"setregid"}, 2},
+ {CDM::CLibrary, {"setresuid"}, 3}, {CDM::CLibrary, {"setresgid"}, 3}};
+
+public:
+ void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
+ ProgramStateRef evalAssume(ProgramStateRef State, SVal Cond,
+ bool Assumption) const;
+
+private:
+ void processSetuid(ProgramStateRef State, const CallEvent &Call,
+ CheckerContext &C) const;
+ void processSetgid(ProgramStateRef State, const CallEvent &Call,
+ CheckerContext &C) const;
+ void processOther(ProgramStateRef State, const CallEvent &Call,
+ CheckerContext &C) const;
+ /// Check if a function like \c getuid or \c getgid is called directly from
+ /// the first argument of the function called by \a Call.
+ bool isFunctionCalledInArg(const CallDescription &Desc,
+ const CallEvent &Call) const;
+ void emitReport(ProgramStateRef State, CheckerContext &C) const;
+};
+
+} // end anonymous namespace
+
+/// Store whether there was a call to 'setuid(getuid())' or 'setgid(getgid())'
+/// that is not followed by a different privilege-changing function.
+/// If the value \c Setuid is stored and a 'setgid(getgid())' call is found, we
+/// have found the bug to be reported. The value \c Setgid is also stored, to
+/// prevent warnings on a setgid-setuid-setgid sequence.
+REGISTER_TRAIT_WITH_PROGRAMSTATE(LastSetPrivilegeCall, SetPrivilegeFunctionKind)
+/// Store the symbol value of the last 'setuid(getuid())' call. This is used to
+/// detect if the result is compared to -1 and avoid warnings on that branch
+/// (which is the failure branch of the call), and for identification of note
+/// tags.
+REGISTER_TRAIT_WITH_PROGRAMSTATE(LastSetuidCallSVal, SymbolRef)
+
+void SetgidSetuidOrderChecker::checkPostCall(const CallEvent &Call,
+ CheckerContext &C) const {
+ ProgramStateRef State = C.getState();
+ if (SetuidDesc.matches(Call)) {
+ processSetuid(State, Call, C);
+ } else if (SetgidDesc.matches(Call)) {
+ processSetgid(State, Call, C);
+ } else if (OtherSetPrivilegeDesc.contains(Call)) {
+ processOther(State, Call, C);
+ }
+}
+
+ProgramStateRef SetgidSetuidOrderChecker::evalAssume(ProgramStateRef State,
+ SVal Cond,
+ bool Assumption) const {
+ SValBuilder &SVB = State->getStateManager().getSValBuilder();
+ SymbolRef LastSetuidSym = State->get<LastSetuidCallSVal>();
+ if (!LastSetuidSym)
+ return State;
+
+ // Check if the most recent call to 'setuid(getuid())' is assumed to be != 0.
+ // It should be only -1 at failure, but we want to accept a "!= 0" check too.
+ // (As a consequence, an invalid failure check like "!= 1" is recognized as
+ // correct too; that "invalid failure check" is a different bug that is
+ // outside the scope of this checker.)
+ auto FailComparison =
+ SVB.evalBinOpNN(State, BO_NE, nonloc::SymbolVal(LastSetuidSym),
+ SVB.makeIntVal(0, /*isUnsigned=*/false),
+ SVB.getConditionType())
+ .getAs<DefinedOrUnknownSVal>();
+ if (!FailComparison)
+ return State;
+ if (auto IsFailBranch = State->assume(*FailComparison);
+ IsFailBranch.first && !IsFailBranch.second) {
+ // This is the 'setuid(getuid())' != 0 case.
+ // On this branch we do not want to emit warning.
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ State = State->set<LastSetuidCallSVal>(SymbolRef{});
+ }
+ return State;
+}
+
+void SetgidSetuidOrderChecker::processSetuid(ProgramStateRef State,
+ const CallEvent &Call,
+ CheckerContext &C) const {
+ bool IsSetuidWithGetuid = isFunctionCalledInArg(GetuidDesc, Call);
+ if (State->get<LastSetPrivilegeCall>() != Setgid && IsSetuidWithGetuid) {
+ SymbolRef RetSym = Call.getReturnValue().getAsSymbol();
+ State = State->set<LastSetPrivilegeCall>(Setuid);
+ State = State->set<LastSetuidCallSVal>(RetSym);
+ const NoteTag *Note = C.getNoteTag([this,
+ RetSym](PathSensitiveBugReport &BR) {
+ if (!BR.isInteresting(RetSym) || &BR.getBugType() != &this->BT)
+ return "";
+ return "Call to 'setuid' found here that removes superuser privileges";
+ });
+ C.addTransition(State, Note);
+ return;
+ }
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ State = State->set<LastSetuidCallSVal>(SymbolRef{});
+ C.addTransition(State);
+}
+
+void SetgidSetuidOrderChecker::processSetgid(ProgramStateRef State,
+ const CallEvent &Call,
+ CheckerContext &C) const {
+ bool IsSetgidWithGetgid = isFunctionCalledInArg(GetgidDesc, Call);
+ if (State->get<LastSetPrivilegeCall>() == Setuid) {
+ if (IsSetgidWithGetgid) {
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ emitReport(State, C);
+ return;
+ }
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ } else {
+ State = State->set<LastSetPrivilegeCall>(IsSetgidWithGetgid ? Setgid
+ : Irrelevant);
+ }
+ State = State->set<LastSetuidCallSVal>(SymbolRef{});
+ C.addTransition(State);
+}
+
+void SetgidSetuidOrderChecker::processOther(ProgramStateRef State,
+ const CallEvent &Call,
+ CheckerContext &C) const {
+ State = State->set<LastSetuidCallSVal>(SymbolRef{});
+ State = State->set<LastSetPrivilegeCall>(Irrelevant);
+ C.addTransition(State);
+}
+
+bool SetgidSetuidOrderChecker::isFunctionCalledInArg(
+ const CallDescription &Desc, const CallEvent &Call) const {
+ if (const auto *CallInArg0 =
+ dyn_cast<CallExpr>(Call.getArgExpr(0)->IgnoreParenImpCasts()))
+ return Desc.matchesAsWritten(*CallInArg0);
+ return false;
+}
+
+void SetgidSetuidOrderChecker::emitReport(ProgramStateRef State,
+ CheckerContext &C) const {
+ if (ExplodedNode *N = C.generateNonFatalErrorNode(State)) {
+ llvm::StringLiteral Msg =
+ "A 'setgid(getgid())' call following a 'setuid(getuid())' "
+ "call is likely to fail; probably the order of these "
+ "statements is wrong";
+ auto Report = std::make_unique<PathSensitiveBugReport>(BT, Msg, N);
+ Report->markInteresting(State->get<LastSetuidCallSVal>());
+ C.emitReport(std::move(Report));
+ }
+}
+
+void ento::registerSetgidSetuidOrderChecker(CheckerManager &mgr) {
+ mgr.registerChecker<SetgidSetuidOrderChecker>();
+}
+
+bool ento::shouldRegisterSetgidSetuidOrderChecker(const CheckerManager &mgr) {
+ return true;
+}
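For reference, a minimal sketch of the kind of user code the new checker is meant to flag (hypothetical example, not part of the patch):

  #include <unistd.h>

  // Privileges are revoked in the wrong order: once setuid(getuid()) has
  // dropped superuser rights, the following setgid(getgid()) call is likely
  // to fail, so the process may keep its elevated group ID.
  void drop_privileges(void) {
    if (setuid(getuid()) == -1)
      return;
    if (setgid(getgid()) == -1) // warned: likely wrong order of privilege revocation
      return;
  }

Calling setgid(getgid()) before setuid(getuid()) is the order the checker expects and does not warn about.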
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 5c797d523308..49bbff194216 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -271,6 +271,43 @@ public:
TrivialFunctionAnalysisVisitor(CacheTy &Cache) : Cache(Cache) {}
+ bool IsFunctionTrivial(const Decl *D) {
+ auto CacheIt = Cache.find(D);
+ if (CacheIt != Cache.end())
+ return CacheIt->second;
+
+ // Treat a recursive function call as trivial until proven otherwise.
+ auto [RecursiveIt, IsNew] = RecursiveFn.insert(std::make_pair(D, true));
+ if (!IsNew)
+ return RecursiveIt->second;
+
+ bool Result = [&]() {
+ if (auto *CtorDecl = dyn_cast<CXXConstructorDecl>(D)) {
+ for (auto *CtorInit : CtorDecl->inits()) {
+ if (!Visit(CtorInit->getInit()))
+ return false;
+ }
+ }
+ const Stmt *Body = D->getBody();
+ if (!Body)
+ return false;
+ return Visit(Body);
+ }();
+
+ if (!Result) {
+ // D and its mutually recursive callers are all non-trivial.
+ for (auto &It : RecursiveFn)
+ It.second = false;
+ }
+ RecursiveIt = RecursiveFn.find(D);
+ assert(RecursiveIt != RecursiveFn.end());
+ Result = RecursiveIt->second;
+ RecursiveFn.erase(RecursiveIt);
+ Cache[D] = Result;
+
+ return Result;
+ }
+
bool VisitStmt(const Stmt *S) {
    // All statements are non-trivial unless overridden later.
// Don't even recurse into children by default.
@@ -368,7 +405,7 @@ public:
Name == "bitwise_cast" || Name.find("__builtin") == 0)
return true;
- return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
+ return IsFunctionTrivial(Callee);
}
bool
@@ -403,7 +440,7 @@ public:
return true;
// Recursively descend into the callee to confirm that it's trivial as well.
- return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
+ return IsFunctionTrivial(Callee);
}
bool VisitCXXOperatorCallExpr(const CXXOperatorCallExpr *OCE) {
@@ -413,7 +450,7 @@ public:
if (!Callee)
return false;
// Recursively descend into the callee to confirm that it's trivial as well.
- return TrivialFunctionAnalysis::isTrivialImpl(Callee, Cache);
+ return IsFunctionTrivial(Callee);
}
bool VisitCXXDefaultArgExpr(const CXXDefaultArgExpr *E) {
@@ -439,7 +476,7 @@ public:
}
// Recursively descend into the callee to confirm that it's trivial.
- return TrivialFunctionAnalysis::isTrivialImpl(CE->getConstructor(), Cache);
+ return IsFunctionTrivial(CE->getConstructor());
}
bool VisitCXXNewExpr(const CXXNewExpr *NE) { return VisitChildren(NE); }
@@ -513,36 +550,13 @@ public:
private:
CacheTy &Cache;
+ CacheTy RecursiveFn;
};
bool TrivialFunctionAnalysis::isTrivialImpl(
const Decl *D, TrivialFunctionAnalysis::CacheTy &Cache) {
- // If the function isn't in the cache, conservatively assume that
- // it's not trivial until analysis completes. This makes every recursive
- // function non-trivial. This also guarantees that each function
- // will be scanned at most once.
- auto [It, IsNew] = Cache.insert(std::make_pair(D, false));
- if (!IsNew)
- return It->second;
-
TrivialFunctionAnalysisVisitor V(Cache);
-
- if (auto *CtorDecl = dyn_cast<CXXConstructorDecl>(D)) {
- for (auto *CtorInit : CtorDecl->inits()) {
- if (!V.Visit(CtorInit->getInit()))
- return false;
- }
- }
-
- const Stmt *Body = D->getBody();
- if (!Body)
- return false;
-
- bool Result = V.Visit(Body);
- if (Result)
- Cache[D] = true;
-
- return Result;
+ return V.IsFunctionTrivial(D);
}
bool TrivialFunctionAnalysis::isTrivialImpl(
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
index 0d9710a5e2d8..274da0baf2ce 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
@@ -135,7 +135,19 @@ public:
bool shouldVisitImplicitCode() const { return false; }
bool VisitVarDecl(VarDecl *V) {
- Checker->visitVarDecl(V);
+ auto *Init = V->getInit();
+ if (Init && V->isLocalVarDecl())
+ Checker->visitVarDecl(V, Init);
+ return true;
+ }
+
+ bool VisitBinaryOperator(const BinaryOperator *BO) {
+ if (BO->isAssignmentOp()) {
+ if (auto *VarRef = dyn_cast<DeclRefExpr>(BO->getLHS())) {
+ if (auto *V = dyn_cast<VarDecl>(VarRef->getDecl()))
+ Checker->visitVarDecl(V, BO->getRHS());
+ }
+ }
return true;
}
@@ -174,7 +186,7 @@ public:
visitor.TraverseDecl(const_cast<TranslationUnitDecl *>(TUD));
}
- void visitVarDecl(const VarDecl *V) const {
+ void visitVarDecl(const VarDecl *V, const Expr *Value) const {
if (shouldSkipVarDecl(V))
return;
@@ -184,12 +196,8 @@ public:
std::optional<bool> IsUncountedPtr = isUncountedPtr(ArgType);
if (IsUncountedPtr && *IsUncountedPtr) {
- const Expr *const InitExpr = V->getInit();
- if (!InitExpr)
- return; // FIXME: later on we might warn on uninitialized vars too
-
if (tryToFindPtrOrigin(
- InitExpr, /*StopAtFirstRefCountedObj=*/false,
+ Value, /*StopAtFirstRefCountedObj=*/false,
[&](const clang::Expr *InitArgOrigin, bool IsSafe) {
if (!InitArgOrigin)
return true;
@@ -232,34 +240,46 @@ public:
}))
return;
- reportBug(V);
+ reportBug(V, Value);
}
}
bool shouldSkipVarDecl(const VarDecl *V) const {
assert(V);
- if (!V->isLocalVarDecl())
- return true;
-
- if (BR->getSourceManager().isInSystemHeader(V->getLocation()))
- return true;
-
- return false;
+ return BR->getSourceManager().isInSystemHeader(V->getLocation());
}
- void reportBug(const VarDecl *V) const {
+ void reportBug(const VarDecl *V, const Expr *Value) const {
assert(V);
SmallString<100> Buf;
llvm::raw_svector_ostream Os(Buf);
- Os << "Local variable ";
- printQuotedQualifiedName(Os, V);
- Os << " is uncounted and unsafe.";
-
- PathDiagnosticLocation BSLoc(V->getLocation(), BR->getSourceManager());
- auto Report = std::make_unique<BasicBugReport>(Bug, Os.str(), BSLoc);
- Report->addRange(V->getSourceRange());
- BR->emitReport(std::move(Report));
+ if (dyn_cast<ParmVarDecl>(V)) {
+ Os << "Assignment to an uncounted parameter ";
+ printQuotedQualifiedName(Os, V);
+ Os << " is unsafe.";
+
+ PathDiagnosticLocation BSLoc(Value->getExprLoc(), BR->getSourceManager());
+ auto Report = std::make_unique<BasicBugReport>(Bug, Os.str(), BSLoc);
+ Report->addRange(Value->getSourceRange());
+ BR->emitReport(std::move(Report));
+ } else {
+ if (V->hasLocalStorage())
+ Os << "Local variable ";
+ else if (V->isStaticLocal())
+ Os << "Static local variable ";
+ else if (V->hasGlobalStorage())
+ Os << "Global variable ";
+ else
+ Os << "Variable ";
+ printQuotedQualifiedName(Os, V);
+ Os << " is uncounted and unsafe.";
+
+ PathDiagnosticLocation BSLoc(V->getLocation(), BR->getSourceManager());
+ auto Report = std::make_unique<BasicBugReport>(Bug, Os.str(), BSLoc);
+ Report->addRange(V->getSourceRange());
+ BR->emitReport(std::move(Report));
+ }
}
};
} // namespace
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 0b1edf3e5c96..793f3a63ea29 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1970,33 +1970,45 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
ExplodedNodeSet Tmp;
StmtNodeBuilder Bldr2(PreVisit, Tmp, *currBldrCtx);
- const Expr *ArgE;
- if (const auto *DefE = dyn_cast<CXXDefaultArgExpr>(S))
+ bool HasRewrittenInit = false;
+ const Expr *ArgE = nullptr;
+ if (const auto *DefE = dyn_cast<CXXDefaultArgExpr>(S)) {
ArgE = DefE->getExpr();
- else if (const auto *DefE = dyn_cast<CXXDefaultInitExpr>(S))
+ HasRewrittenInit = DefE->hasRewrittenInit();
+ } else if (const auto *DefE = dyn_cast<CXXDefaultInitExpr>(S)) {
ArgE = DefE->getExpr();
- else
+ HasRewrittenInit = DefE->hasRewrittenInit();
+ } else
llvm_unreachable("unknown constant wrapper kind");
- bool IsTemporary = false;
- if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(ArgE)) {
- ArgE = MTE->getSubExpr();
- IsTemporary = true;
- }
+ if (HasRewrittenInit) {
+ for (auto *N : PreVisit) {
+ ProgramStateRef state = N->getState();
+ const LocationContext *LCtx = N->getLocationContext();
+ state = state->BindExpr(S, LCtx, state->getSVal(ArgE, LCtx));
+ Bldr2.generateNode(S, N, state);
+ }
+ } else {
+ // If it's not rewritten, the contents of these expressions are not
+ // actually part of the current function, so we fall back to constant
+ // evaluation.
+ bool IsTemporary = false;
+ if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(ArgE)) {
+ ArgE = MTE->getSubExpr();
+ IsTemporary = true;
+ }
+
+ std::optional<SVal> ConstantVal = svalBuilder.getConstantVal(ArgE);
+ const LocationContext *LCtx = Pred->getLocationContext();
+ for (auto *I : PreVisit) {
+ ProgramStateRef State = I->getState();
+ State = State->BindExpr(S, LCtx, ConstantVal.value_or(UnknownVal()));
+ if (IsTemporary)
+ State = createTemporaryRegionIfNeeded(State, LCtx, cast<Expr>(S),
+ cast<Expr>(S));
- std::optional<SVal> ConstantVal = svalBuilder.getConstantVal(ArgE);
- if (!ConstantVal)
- ConstantVal = UnknownVal();
-
- const LocationContext *LCtx = Pred->getLocationContext();
- for (const auto I : PreVisit) {
- ProgramStateRef State = I->getState();
- State = State->BindExpr(S, LCtx, *ConstantVal);
- if (IsTemporary)
- State = createTemporaryRegionIfNeeded(State, LCtx,
- cast<Expr>(S),
- cast<Expr>(S));
- Bldr2.generateNode(S, I, State);
+ Bldr2.generateNode(S, I, State);
+ }
}
getCheckerManager().runCheckersForPostStmt(Dst, Tmp, S, *this);
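A hedged sketch of the kind of code this change affects (hypothetical example, not part of the patch): default arguments built from constructs such as std::source_location::current() are rewritten at each call site, so hasRewrittenInit() is true and the engine binds the already-evaluated initializer value instead of falling back to constant evaluation.

  #include <source_location>

  // The default argument is re-evaluated at every call site, so its value is
  // modeled as part of the calling function rather than as a folded constant.
  unsigned caller_line(unsigned l = std::source_location::current().line()) {
    return l;
  }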
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index e936ec6dc894..dd5064d993e6 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -54,6 +54,10 @@ constexpr int derefPtr(const int *d) {
}
static_assert(derefPtr(data) == 5, "");
+/// Make sure we can refer to the one-past-the-end element
+/// and then return back to the end of the array.
+static_assert((&data[5])[-1] == 1, "");
+
constexpr int storePtr() {
int b[] = {1,2,3,4};
int *c = b;
diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp
index fbe76aba73c9..0a17106449fa 100644
--- a/clang/test/AST/Interp/builtin-functions.cpp
+++ b/clang/test/AST/Interp/builtin-functions.cpp
@@ -900,7 +900,7 @@ namespace shufflevector {
static_assert(vectorShuffle6[7] == 7, "");// ref-error {{not an integral constant expression}}
constexpr vector4char vectorShuffleFail1 = __builtin_shufflevector( // both-error {{must be initialized by a constant expression}}\
- // ref-error {{index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position 0 not permitted in a constexpr context.}}
+ // ref-error {{index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position 0 is not permitted in a constexpr context}}
vector4charConst1,
vector4charConst2, -1, -1, -1, -1);
}
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index 2a75457a4693..f4c7bf16f2f9 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -278,3 +278,15 @@ void addrlabelexpr(void) {
a0: ;
static void *ps[] = { &&a0 }; // pedantic-warning {{use of GNU address-of-label extension}}
}
+
+extern void cv2;
+void *foo5 (void)
+{
+ return &cv2; // pedantic-warning{{address of an expression of type 'void'}}
+}
+
+__attribute__((weak)) const unsigned int test10_bound = 10;
+char test10_global[test10_bound]; // all-error {{variable length array declaration not allowed at file scope}}
+void test10(void) {
+ char test10_local[test10_bound] = "help"; // all-error {{variable-sized object may not be initialized}}
+}
diff --git a/clang/test/AST/Interp/cxx03.cpp b/clang/test/AST/Interp/cxx03.cpp
index b6aaf0840cfb..70ae4134842b 100644
--- a/clang/test/AST/Interp/cxx03.cpp
+++ b/clang/test/AST/Interp/cxx03.cpp
@@ -24,3 +24,8 @@ namespace NonLValueMemberExpr {
const int &TT1::subobj_init = PODType().value;
}
+
+void LambdaAccessingADummy() {
+ int d;
+ int a9[1] = {[d = 0] = 1}; // both-error {{is not an integral constant expression}}
+}
diff --git a/clang/test/AST/Interp/cxx98.cpp b/clang/test/AST/Interp/cxx98.cpp
index be81735329db..e68e4dbc8d74 100644
--- a/clang/test/AST/Interp/cxx98.cpp
+++ b/clang/test/AST/Interp/cxx98.cpp
@@ -50,3 +50,7 @@ _Static_assert(c0_test == 0, "");
int a = 0; // both-note {{declared here}}
_Static_assert(a == 0, ""); // both-error {{static assertion expression is not an integral constant expression}} \
// both-note {{read of non-const variable 'a' is not allowed in a constant expression}}
+
+struct SelfReference { SelfReference &r; };
+extern SelfReference self_reference_1;
+SelfReference self_reference_2 = {self_reference_1};
diff --git a/clang/test/AST/Interp/eval-order.cpp b/clang/test/AST/Interp/eval-order.cpp
index 695a43c9d235..aaf2b74510bb 100644
--- a/clang/test/AST/Interp/eval-order.cpp
+++ b/clang/test/AST/Interp/eval-order.cpp
@@ -1,8 +1,7 @@
-// RUN: %clang_cc1 -std=c++1z -verify %s -fcxx-exceptions -triple=x86_64-linux-gnu
-// RUN: %clang_cc1 -std=c++1z -verify %s -fcxx-exceptions -triple=x86_64-linux-gnu -fexperimental-new-constant-interpreter
+// RUN: %clang_cc1 -std=c++1z -verify=ref,both %s -fcxx-exceptions -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -std=c++1z -verify=expected,both %s -fcxx-exceptions -triple=x86_64-linux-gnu -fexperimental-new-constant-interpreter
// ref-no-diagnostics
-// expected-no-diagnostics
/// Check that assignment operators evaluate their operands right-to-left.
/// Copied from test/SemaCXX/constant-expression-cxx1z.cpp
@@ -46,7 +45,7 @@ namespace EvalOrder {
}
template <typename T> constexpr T &&b(T &&v) {
if (!done_a)
- throw "wrong";
+ throw "wrong"; // expected-note 7{{not valid}}
done_b = true;
return (T &&)v;
}
@@ -76,21 +75,30 @@ namespace EvalOrder {
// SEQ(A(&ud)->*B(&UserDefined::n)); FIXME
// Rule 4: a(b1, b2, b3)
- // SEQ(A(f)(B(1), B(2), B(3))); FIXME
+ SEQ(A(f)(B(1), B(2), B(3))); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
// Rule 5: b = a, b @= a
- // SEQ(B(lvalue<int>().get()) = A(0)); FIXME
- // SEQ(B(lvalue<UserDefined>().get()) = A(ud)); FIXME
+ SEQ(B(lvalue<int>().get()) = A(0)); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
+ SEQ(B(lvalue<UserDefined>().get()) = A(ud)); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
SEQ(B(lvalue<int>().get()) += A(0));
- // SEQ(B(lvalue<UserDefined>().get()) += A(ud)); FIXME
- // SEQ(B(lvalue<NonMember>().get()) += A(nm)); FIXME
+ SEQ(B(lvalue<UserDefined>().get()) += A(ud)); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
+
+ SEQ(B(lvalue<NonMember>().get()) += A(nm)); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
+
// Rule 6: a[b]
constexpr int arr[3] = {};
SEQ(A(arr)[B(0)]);
SEQ(A(+arr)[B(0)]);
- // SEQ(A(0)[B(arr)]); FIXME
- // SEQ(A(0)[B(+arr)]); FIXME
+ SEQ(A(0)[B(arr)]); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
+ SEQ(A(0)[B(+arr)]); // expected-error {{not an integral constant expression}} FIXME \
+ // expected-note 2{{in call to}}
SEQ(A(ud)[B(0)]);
// Rule 7: a << b
diff --git a/clang/test/AST/Interp/objc.mm b/clang/test/AST/Interp/objc.mm
new file mode 100644
index 000000000000..44b74d193b66
--- /dev/null
+++ b/clang/test/AST/Interp/objc.mm
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -verify=ref,both %s
+
+@interface A {
+ int a;
+ static_assert(a, ""); // both-error {{static assertion expression is not an integral constant expression}}
+}
+@end
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index 3a5ecd291a56..97ac3e916955 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1335,8 +1335,6 @@ namespace UnnamedBitFields {
static_assert(a.c == 'a', "");
}
-/// FIXME: This still doesn't work in the new interpreter because
-/// we lack type information for dummy pointers.
namespace VirtualBases {
/// This used to crash.
namespace One {
@@ -1346,7 +1344,7 @@ namespace VirtualBases {
};
class B : public virtual A {
public:
- int getX() { return x; } // ref-note {{declared here}}
+ int getX() { return x; } // both-note {{declared here}}
};
class DV : virtual public B{};
@@ -1354,7 +1352,7 @@ namespace VirtualBases {
void foo() {
DV b;
int a[b.getX()]; // both-warning {{variable length arrays}} \
- // ref-note {{non-constexpr function 'getX' cannot be used}}
+ // both-note {{non-constexpr function 'getX' cannot be used}}
}
}
diff --git a/clang/test/AST/Interp/unions.cpp b/clang/test/AST/Interp/unions.cpp
new file mode 100644
index 000000000000..293a1981a52f
--- /dev/null
+++ b/clang/test/AST/Interp/unions.cpp
@@ -0,0 +1,67 @@
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s
+// RUN: %clang_cc1 -verify=ref,both %s
+
+union U {
+ int a;
+ int b;
+};
+
+constexpr U a = {12};
+static_assert(a.a == 12, "");
+static_assert(a.b == 0, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of member 'b' of union with active member 'a'}}
+union U1 {
+ int i;
+ float f = 3.0f;
+};
+constexpr U1 u1{};
+static_assert(u1.f == 3.0, "");
+static_assert(u1.i == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of member 'i' of union with active member 'f'}}
+
+
+
+union A {
+ int a;
+ double d;
+};
+constexpr A aa = {1, 2.0}; // both-error {{excess elements in union initializer}}
+constexpr A ab = {.d = 1.0};
+static_assert(ab.d == 1.0, "");
+static_assert(ab.a == 1, ""); // both-error {{not an integral constant expression}} \
+ // both-note {{read of member 'a' of union with active member 'd'}}
+
+
+namespace Empty {
+ union E {};
+ constexpr E e{};
+}
+
+namespace SimpleStore {
+ union A {
+ int a;
+ int b;
+ };
+ constexpr int foo() {
+ A a{.b = 4};
+ a.b = 10;
+ return a.b;
+ }
+ static_assert(foo() == 10, "");
+
+ constexpr int empty() {
+ A a{}; /// Just test that this works.
+ return 10;
+ }
+ static_assert(empty() == 10, "");
+}
+
+namespace ZeroInit {
+ struct S { int m; };
+ union Z {
+ float f;
+ };
+
+ constexpr Z z{};
+ static_assert(z.f == 0.0, "");
+}
diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp
index 554cdcf83fcd..e062d4f068a4 100644
--- a/clang/test/AST/ast-dump-decl.cpp
+++ b/clang/test/AST/ast-dump-decl.cpp
@@ -459,7 +459,7 @@ namespace testClassTemplateDecl {
// CHECK: ClassTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-148]]:3, col:31> col:31 TestTemplateDefaultNonType{{$}}
// CHECK-NEXT: |-NonTypeTemplateParmDecl 0x{{.+}} <col:12, col:20> col:16 'int' depth 0 index 0 I{{$}}
-// CHECK-NEXT: | `-TemplateArgument expr{{$}}
+// CHECK-NEXT: | `-TemplateArgument <col:20> expr{{$}}
// CHECK-NEXT: | `-IntegerLiteral 0x{{.+}} <col:20> 'int' 42{{$}}
// CHECK-NEXT: `-CXXRecordDecl 0x{{.+}} <col:24, col:31> col:31 struct TestTemplateDefaultNonType{{$}}
@@ -671,7 +671,7 @@ namespace TestNonTypeTemplateParmDecl {
// CHECK: NamespaceDecl{{.*}} TestNonTypeTemplateParmDecl
// CHECK-NEXT: FunctionTemplateDecl
// CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'int' depth 0 index 0 I
-// CHECK-NEXT: TemplateArgument expr
+// CHECK-NEXT: TemplateArgument {{.*}} expr
// CHECK-NEXT: IntegerLiteral{{.*}} 'int' 1
// CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} 'int' depth 0 index 1 ... J
diff --git a/clang/test/AST/ast-dump-default-init-json.cpp b/clang/test/AST/ast-dump-default-init-json.cpp
index 1058b4e3ea4d..f4949a9c9eed 100644
--- a/clang/test/AST/ast-dump-default-init-json.cpp
+++ b/clang/test/AST/ast-dump-default-init-json.cpp
@@ -789,10 +789,10 @@ void test() {
// CHECK-NEXT: "valueCategory": "lvalue",
// CHECK-NEXT: "extendingDecl": {
// CHECK-NEXT: "id": "0x{{.*}}",
-// CHECK-NEXT: "kind": "FieldDecl",
-// CHECK-NEXT: "name": "a",
+// CHECK-NEXT: "kind": "VarDecl",
+// CHECK-NEXT: "name": "b",
// CHECK-NEXT: "type": {
-// CHECK-NEXT: "qualType": "const A &"
+// CHECK-NEXT: "qualType": "B"
// CHECK-NEXT: }
// CHECK-NEXT: },
// CHECK-NEXT: "storageDuration": "automatic",
diff --git a/clang/test/AST/ast-dump-default-init.cpp b/clang/test/AST/ast-dump-default-init.cpp
index 15b29f04bf21..26864fbf1542 100644
--- a/clang/test/AST/ast-dump-default-init.cpp
+++ b/clang/test/AST/ast-dump-default-init.cpp
@@ -13,7 +13,7 @@ void test() {
}
// CHECK: -CXXDefaultInitExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue has rewritten init
// CHECK-NEXT: `-ExprWithCleanups 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue
-// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Field 0x{{[^ ]*}} 'a' 'const A &'
+// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Var 0x{{[^ ]*}} 'b' 'B'
// CHECK-NEXT: `-ImplicitCastExpr 0x{{[^ ]*}} <{{.*}}> 'const A' <NoOp>
// CHECK-NEXT: `-CXXFunctionalCastExpr 0x{{[^ ]*}} <{{.*}}> 'A' functional cast to A <NoOp>
// CHECK-NEXT: `-InitListExpr 0x{{[^ ]*}} <{{.*}}> 'A'
diff --git a/clang/test/AST/ast-dump-expr-json.cpp b/clang/test/AST/ast-dump-expr-json.cpp
index 4b7365e554cb..dd2fe1fcf60c 100644
--- a/clang/test/AST/ast-dump-expr-json.cpp
+++ b/clang/test/AST/ast-dump-expr-json.cpp
@@ -2333,7 +2333,7 @@ void TestNonADLCall3() {
// CHECK-NEXT: "kind": "FunctionDecl",
// CHECK-NEXT: "name": "operator delete",
// CHECK-NEXT: "type": {
-// CHECK-NEXT: "qualType": "void (void *) noexcept"
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
// CHECK-NEXT: }
// CHECK-NEXT: },
// CHECK-NEXT: "inner": [
diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp
index 604868103dab..f9e9ee9d35dd 100644
--- a/clang/test/AST/ast-dump-expr.cpp
+++ b/clang/test/AST/ast-dump-expr.cpp
@@ -164,7 +164,7 @@ void UnaryExpressions(int *p) {
// CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} <col:8> 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *'
::delete p;
- // CHECK: CXXDeleteExpr 0x{{[^ ]*}} <line:[[@LINE-1]]:3, col:12> 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *) noexcept'
+ // CHECK: CXXDeleteExpr 0x{{[^ ]*}} <line:[[@LINE-1]]:3, col:12> 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, unsigned long) noexcept'
// CHECK-NEXT: ImplicitCastExpr
// CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} <col:12> 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *'
diff --git a/clang/test/AST/ast-dump-stmt-json.cpp b/clang/test/AST/ast-dump-stmt-json.cpp
index 667a12a01202..a473d17da942 100644
--- a/clang/test/AST/ast-dump-stmt-json.cpp
+++ b/clang/test/AST/ast-dump-stmt-json.cpp
@@ -994,7 +994,7 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NEXT: "kind": "FunctionDecl",
// CHECK-NEXT: "name": "operator delete",
// CHECK-NEXT: "type": {
-// CHECK-NEXT: "qualType": "void (void *) noexcept"
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
// CHECK-NEXT: }
// CHECK-NEXT: },
// CHECK-NEXT: "inner": [
@@ -1369,7 +1369,7 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NEXT: "kind": "FunctionDecl",
// CHECK-NEXT: "name": "operator delete",
// CHECK-NEXT: "type": {
-// CHECK-NEXT: "qualType": "void (void *) noexcept"
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
// CHECK-NEXT: }
// CHECK-NEXT: },
// CHECK-NEXT: "inner": [
@@ -1722,7 +1722,6 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NEXT: "end": {}
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
-// CHECK-NEXT: "isUsed": true,
// CHECK-NEXT: "name": "operator delete",
// CHECK-NEXT: "mangledName": "_ZdlPv",
// CHECK-NEXT: "type": {
@@ -1819,6 +1818,126 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NEXT: },
// CHECK-NEXT: "isImplicit": true,
// CHECK-NEXT: "isUsed": true,
+// CHECK-NEXT: "name": "operator delete",
+// CHECK-NEXT: "mangledName": "_ZdlPvm",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void *"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned long"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "VisibilityAttr",
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "implicit": true,
+// CHECK-NEXT: "visibility": "default"
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+
+// CHECK-NOT: {{^}}Dumping
+// CHECK: "kind": "FunctionDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "operator delete",
+// CHECK-NEXT: "mangledName": "_ZdlPvmSt11align_val_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void *"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned long"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "std::align_val_t"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "VisibilityAttr",
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "implicit": true,
+// CHECK-NEXT: "visibility": "default"
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+
+// CHECK-NOT: {{^}}Dumping
+// CHECK: "kind": "FunctionDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "isUsed": true,
// CHECK-NEXT: "name": "operator delete[]",
// CHECK-NEXT: "mangledName": "_ZdaPv",
// CHECK-NEXT: "type": {
@@ -1907,6 +2026,125 @@ void TestDependentGenericSelectionExpr(Ty T) {
// CHECK-NOT: {{^}}Dumping
+// CHECK: "kind": "FunctionDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "operator delete[]",
+// CHECK-NEXT: "mangledName": "_ZdaPvm",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void *"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned long"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "VisibilityAttr",
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "implicit": true,
+// CHECK-NEXT: "visibility": "default"
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+
+// CHECK-NOT: {{^}}Dumping
+// CHECK: "kind": "FunctionDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "name": "operator delete[]",
+// CHECK-NEXT: "mangledName": "_ZdaPvmSt11align_val_t",
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept"
+// CHECK-NEXT: },
+// CHECK-NEXT: "inner": [
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "void *"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "unsigned long"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "ParmVarDecl",
+// CHECK-NEXT: "loc": {},
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "isImplicit": true,
+// CHECK-NEXT: "type": {
+// CHECK-NEXT: "qualType": "std::align_val_t"
+// CHECK-NEXT: }
+// CHECK-NEXT: },
+// CHECK-NEXT: {
+// CHECK-NEXT: "id": "0x{{.*}}",
+// CHECK-NEXT: "kind": "VisibilityAttr",
+// CHECK-NEXT: "range": {
+// CHECK-NEXT: "begin": {},
+// CHECK-NEXT: "end": {}
+// CHECK-NEXT: },
+// CHECK-NEXT: "implicit": true,
+// CHECK-NEXT: "visibility": "default"
+// CHECK-NEXT: }
+// CHECK-NEXT: ]
+// CHECK-NEXT: }
+
+// CHECK-NOT: {{^}}Dumping
// CHECK: "kind": "FunctionTemplateDecl",
// CHECK-NEXT: "loc": {
// CHECK-NEXT: "offset": 598,
diff --git a/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
new file mode 100644
index 000000000000..a585a45eeff0
--- /dev/null
+++ b/clang/test/AST/attr-counted-by-late-parsed-struct-ptrs.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_known {
+ int field;
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in decl attribute position
+//==============================================================================
+
+struct on_member_pointer_complete_ty {
+ struct size_known *buf __counted_by(count);
+ int count;
+};
+// CHECK-LABEL: struct on_member_pointer_complete_ty definition
+// CHECK-NEXT: |-FieldDecl {{.*}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: `-FieldDecl {{.*}} referenced count 'int'
+
+struct on_pointer_anon_count {
+ struct size_known *buf __counted_by(count);
+ struct {
+ int count;
+ };
+};
+
+// CHECK-LABEL: struct on_pointer_anon_count definition
+// CHECK-NEXT: |-FieldDecl {{.*}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-RecordDecl {{.*}} struct definition
+// CHECK-NEXT: | `-FieldDecl {{.*}} count 'int'
+// CHECK-NEXT: |-FieldDecl {{.*}} implicit 'struct on_pointer_anon_count::(anonymous at {{.*}})'
+// CHECK-NEXT: `-IndirectFieldDecl {{.*}} implicit referenced count 'int'
+// CHECK-NEXT: |-Field {{.*}} '' 'struct on_pointer_anon_count::(anonymous at {{.*}})'
+// CHECK-NEXT: `-Field {{.*}} 'count' 'int'
+
+//==============================================================================
+// __counted_by on struct member pointer in type attribute position
+//==============================================================================
+// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
+// as a declaration attribute and is **not** late parsed, resulting in the
+// `count` field being unavailable.
+//
+// See `clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c` for test
+// cases.
diff --git a/clang/test/AST/attr-counted-by-struct-ptrs.c b/clang/test/AST/attr-counted-by-struct-ptrs.c
new file mode 100644
index 000000000000..79a453d239cd
--- /dev/null
+++ b/clang/test/AST/attr-counted-by-struct-ptrs.c
@@ -0,0 +1,117 @@
+// RUN: %clang_cc1 %s -ast-dump | FileCheck %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_unknown;
+struct size_known {
+ int field;
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in decl attribute position
+//==============================================================================
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_complete_ty definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+struct on_member_pointer_complete_ty {
+ int count;
+ struct size_known * buf __counted_by(count);
+};
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_buf definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: |-RecordDecl {{.+}} struct definition
+// CHECK-NEXT: | `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_buf::(anonymous at [[ANON_STRUCT_PATH:.+]])'
+// CHECK-NEXT: `-IndirectFieldDecl {{.+}} implicit buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-Field {{.+}} '' 'struct on_pointer_anon_buf::(anonymous at [[ANON_STRUCT_PATH]])'
+// CHECK-NEXT: `-Field {{.+}} 'buf' 'struct size_known * __counted_by(count)':'struct size_known *'
+struct on_pointer_anon_buf {
+ int count;
+ struct {
+ struct size_known *buf __counted_by(count);
+ };
+};
+
+struct on_pointer_anon_count {
+ struct {
+ int count;
+ };
+ struct size_known *buf __counted_by(count);
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in type attribute position
+//==============================================================================
+// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
+// as a declaration attribute
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_complete_ty_ty_pos definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+struct on_member_pointer_complete_ty_ty_pos {
+ int count;
+ struct size_known *__counted_by(count) buf;
+};
+
+// TODO: This should be forbidden but isn't, due to counted_by being treated as
+// a declaration attribute. The attribute ends up on the outermost pointer
+// (allowed by sema) even though syntactically it's supposed to be on the inner
+// pointer (which would not be allowed by sema because the pointee is a
+// function type).
+// CHECK-LABEL: RecordDecl {{.+}} struct on_member_pointer_fn_ptr_ty_ty_pos_inner definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} fn_ptr 'void (** __counted_by(count))(void)':'void (**)(void)'
+struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
+ int count;
+ void (* __counted_by(count) * fn_ptr)(void);
+};
+
+// FIXME: The generated AST here is wrong. The attribute should be on the inner
+// pointer.
+// CHECK-LABEL: RecordDecl {{.+}} struct on_nested_pointer_inner definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known ** __counted_by(count)':'struct size_known **'
+struct on_nested_pointer_inner {
+ int count;
+ // TODO: This should be disallowed because in the `-fbounds-safety` model
+ // `__counted_by` can only be nested when used in function parameters.
+ struct size_known *__counted_by(count) *buf;
+};
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_nested_pointer_outer definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: `-FieldDecl {{.+}} buf 'struct size_known ** __counted_by(count)':'struct size_known **'
+struct on_nested_pointer_outer {
+ int count;
+ struct size_known **__counted_by(count) buf;
+};
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_buf_ty_pos definition
+// CHECK-NEXT: |-FieldDecl {{.+}} referenced count 'int'
+// CHECK-NEXT: |-RecordDecl {{.+}} struct definition
+// CHECK-NEXT: | `-FieldDecl {{.+}} buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_buf_ty_pos::(anonymous at [[ANON_STRUCT_PATH2:.+]])'
+// CHECK-NEXT: `-IndirectFieldDecl {{.+}} implicit buf 'struct size_known * __counted_by(count)':'struct size_known *'
+// CHECK-NEXT: |-Field {{.+}} '' 'struct on_pointer_anon_buf_ty_pos::(anonymous at [[ANON_STRUCT_PATH2]])'
+// CHECK-NEXT: `-Field {{.+}} 'buf' 'struct size_known * __counted_by(count)':'struct size_known *'
+struct on_pointer_anon_buf_ty_pos {
+ int count;
+ struct {
+ struct size_known * __counted_by(count) buf;
+ };
+};
+
+// CHECK-LABEL: RecordDecl {{.+}} struct on_pointer_anon_count_ty_pos definition
+// CHECK-NEXT: |-RecordDecl {{.+}} struct definition
+// CHECK-NEXT: | `-FieldDecl {{.+}} count 'int'
+// CHECK-NEXT: |-FieldDecl {{.+}} implicit 'struct on_pointer_anon_count_ty_pos::(anonymous at [[ANON_STRUCT_PATH3:.+]])'
+// CHECK-NEXT: |-IndirectFieldDecl {{.+}} implicit referenced count 'int'
+// CHECK-NEXT: | |-Field {{.+}} '' 'struct on_pointer_anon_count_ty_pos::(anonymous at [[ANON_STRUCT_PATH3]])'
+// CHECK-NEXT: | `-Field {{.+}} 'count' 'int'
+struct on_pointer_anon_count_ty_pos {
+ struct {
+ int count;
+ };
+ struct size_known *__counted_by(count) buf;
+};
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
index 632a82eb0d8d..25776870dd3a 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
@@ -216,3 +216,76 @@ void foo() {
}
} // namespace conditional_op
+
+namespace local_assignment_basic {
+
+RefCountable *provide_ref_cntbl();
+
+void foo(RefCountable* a) {
+ RefCountable* b = a;
+ // expected-warning@-1{{Local variable 'b' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ if (b->trivial())
+ b = provide_ref_cntbl();
+}
+
+void bar(RefCountable* a) {
+ RefCountable* b;
+ // expected-warning@-1{{Local variable 'b' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ b = provide_ref_cntbl();
+}
+
+void baz() {
+ RefPtr a = provide_ref_cntbl();
+ {
+ RefCountable* b = a.get();
+ // expected-warning@-1{{Local variable 'b' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ b = provide_ref_cntbl();
+ }
+}
+
+} // namespace local_assignment_basic
+
+namespace local_assignment_to_parameter {
+
+RefCountable *provide_ref_cntbl();
+void someFunction();
+
+void foo(RefCountable* a) {
+ a = provide_ref_cntbl();
+ // expected-warning@-1{{Assignment to an uncounted parameter 'a' is unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ someFunction();
+ a->method();
+}
+
+} // namespace local_assignment_to_parameter
+
+namespace local_assignment_to_static_local {
+
+RefCountable *provide_ref_cntbl();
+void someFunction();
+
+void foo() {
+ static RefCountable* a = nullptr;
+ // expected-warning@-1{{Static local variable 'a' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+ a = provide_ref_cntbl();
+ someFunction();
+ a->method();
+}
+
+} // namespace local_assignment_to_static_local
+
+namespace local_assignment_to_global {
+
+RefCountable *provide_ref_cntbl();
+void someFunction();
+
+RefCountable* g_a = nullptr;
+// expected-warning@-1{{Global variable 'local_assignment_to_global::g_a' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+
+void foo() {
+ g_a = provide_ref_cntbl();
+ someFunction();
+ g_a->method();
+}
+
+} // namespace local_assignment_to_global
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index 96986631726f..a98c6eb9c84d 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -231,6 +231,18 @@ public:
void method();
void someFunction();
int otherFunction();
+ unsigned recursiveTrivialFunction(int n) { return !n ? 1 : recursiveTrivialFunction(n - 1); }
+ unsigned recursiveComplexFunction(int n) { return !n ? otherFunction() : recursiveComplexFunction(n - 1); }
+ unsigned mutuallyRecursiveFunction1(int n) { return n < 0 ? 1 : (n % 2 ? mutuallyRecursiveFunction2(n - 2) : mutuallyRecursiveFunction1(n - 1)); }
+ unsigned mutuallyRecursiveFunction2(int n) { return n < 0 ? 1 : (n % 3 ? mutuallyRecursiveFunction2(n - 3) : mutuallyRecursiveFunction1(n - 2)); }
+ unsigned mutuallyRecursiveFunction3(int n) { return n < 0 ? 1 : (n % 5 ? mutuallyRecursiveFunction3(n - 5) : mutuallyRecursiveFunction4(n - 3)); }
+ unsigned mutuallyRecursiveFunction4(int n) { return n < 0 ? 1 : (n % 7 ? otherFunction() : mutuallyRecursiveFunction3(n - 3)); }
+ unsigned recursiveFunction5(unsigned n) { return n > 100 ? 2 : (n % 2 ? recursiveFunction5(n + 1) : recursiveFunction6(n + 2)); }
+ unsigned recursiveFunction6(unsigned n) { return n > 100 ? 3 : (n % 2 ? recursiveFunction6(n % 7) : recursiveFunction7(n % 5)); }
+ unsigned recursiveFunction7(unsigned n) { return n > 100 ? 5 : recursiveFunction7(n * 5); }
+
+ void mutuallyRecursive8() { mutuallyRecursive9(); someFunction(); }
+ void mutuallyRecursive9() { mutuallyRecursive8(); }
int trivial1() { return 123; }
float trivial2() { return 0.3; }
@@ -498,6 +510,24 @@ public:
RefCounted::singleton().trivial18(); // no-warning
RefCounted::singleton().someFunction(); // no-warning
+ getFieldTrivial().recursiveTrivialFunction(7); // no-warning
+ getFieldTrivial().recursiveComplexFunction(9);
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().mutuallyRecursiveFunction1(11); // no-warning
+ getFieldTrivial().mutuallyRecursiveFunction2(13); // no-warning
+ getFieldTrivial().mutuallyRecursiveFunction3(17);
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().mutuallyRecursiveFunction4(19);
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().recursiveFunction5(23); // no-warning
+ getFieldTrivial().recursiveFunction6(29); // no-warning
+ getFieldTrivial().recursiveFunction7(31); // no-warning
+
+ getFieldTrivial().mutuallyRecursive8();
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+ getFieldTrivial().mutuallyRecursive9();
+ // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+
getFieldTrivial().someFunction();
// expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
getFieldTrivial().nonTrivial1();
diff --git a/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp b/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp
deleted file mode 100644
index d982fcb8a1ba..000000000000
--- a/clang/test/Analysis/cert/pos34-c-fp-suppression.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-// RUN: %clang_analyze_cc1 \
-// RUN: -analyzer-checker=alpha.security.cert.pos.34c\
-// RUN: -verify %s
-
-#include "../Inputs/system-header-simulator.h"
-void free(void *memblock);
-void *malloc(size_t size);
-int putenv(char *);
-int rand();
-
-namespace test_auto_var_used_good {
-
-extern char *ex;
-int test_extern() {
- return putenv(ex); // no-warning: extern storage class.
-}
-
-void foo(void) {
- char *buffer = (char *)"huttah!";
- if (rand() % 2 == 0) {
- buffer = (char *)malloc(5);
- strcpy(buffer, "woot");
- }
- putenv(buffer);
-}
-
-void bar(void) {
- char *buffer = (char *)malloc(5);
- strcpy(buffer, "woot");
-
- if (rand() % 2 == 0) {
- free(buffer);
- buffer = (char *)"blah blah blah";
- }
- putenv(buffer);
-}
-
-void baz() {
- char env[] = "NAME=value";
- // TODO: False Positive
- putenv(env);
- // expected-warning@-1 {{The 'putenv' function should not be called with arguments that have automatic storage}}
-
- /*
- DO SOMETHING
- */
-
- putenv((char *)"NAME=anothervalue");
-}
-
-} // namespace test_auto_var_used_good
diff --git a/clang/test/Analysis/cert/pos34-c.cpp b/clang/test/Analysis/cert/pos34-c.cpp
deleted file mode 100644
index f2bd7b393d88..000000000000
--- a/clang/test/Analysis/cert/pos34-c.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-// RUN: %clang_analyze_cc1 \
-// RUN: -analyzer-checker=alpha.security.cert.pos.34c\
-// RUN: -verify %s
-
-// Examples from the CERT rule's page.
-// https://wiki.sei.cmu.edu/confluence/x/6NYxBQ
-
-#include "../Inputs/system-header-simulator.h"
-void free(void *memblock);
-void *malloc(size_t size);
-int putenv(char *);
-int snprintf(char *str, size_t size, const char *format, ...);
-
-namespace test_auto_var_used_bad {
-
-int volatile_memory1(const char *var) {
- char env[1024];
- int retval = snprintf(env, sizeof(env), "TEST=%s", var);
- if (retval < 0 || (size_t)retval >= sizeof(env)) {
- /* Handle error */
- }
-
- return putenv(env);
- // expected-warning@-1 {{The 'putenv' function should not be called with arguments that have automatic storage}}
-}
-
-} // namespace test_auto_var_used_bad
-
-namespace test_auto_var_used_good {
-
-int test_static(const char *var) {
- static char env[1024];
-
- int retval = snprintf(env, sizeof(env), "TEST=%s", var);
- if (retval < 0 || (size_t)retval >= sizeof(env)) {
- /* Handle error */
- }
-
- return putenv(env);
-}
-
-int test_heap_memory(const char *var) {
- static char *oldenv;
- const char *env_format = "TEST=%s";
- const size_t len = strlen(var) + strlen(env_format);
- char *env = (char *)malloc(len);
- if (env == NULL) {
- return -1;
- }
- if (putenv(env) != 0) { // no-warning: env was dynamically allocated.
- free(env);
- return -1;
- }
- if (oldenv != NULL) {
- free(oldenv); /* avoid memory leak */
- }
- oldenv = env;
- return 0;
-}
-
-} // namespace test_auto_var_used_good
diff --git a/clang/test/Analysis/cxx-uninitialized-object.cpp b/clang/test/Analysis/cxx-uninitialized-object.cpp
index e3fa8ae8d7f2..aee0dae15fbf 100644
--- a/clang/test/Analysis/cxx-uninitialized-object.cpp
+++ b/clang/test/Analysis/cxx-uninitialized-object.cpp
@@ -1114,27 +1114,27 @@ void fCXX11MemberInitTest1() {
CXX11MemberInitTest1();
}
+#ifdef PEDANTIC
struct CXX11MemberInitTest2 {
struct RecordType {
- // TODO: we'd expect the note: {{uninitialized field 'this->rec.a'}}
- int a; // no-note
- // TODO: we'd expect the note: {{uninitialized field 'this->rec.b'}}
- int b; // no-note
+ int a; // expected-note {{uninitialized field 'this->a'}}
+ int b; // expected-note {{uninitialized field 'this->b'}}
RecordType(int) {}
};
- RecordType rec = RecordType(int());
+ RecordType rec = RecordType(int()); // expected-warning {{2 uninitialized fields}}
int dontGetFilteredByNonPedanticMode = 0;
CXX11MemberInitTest2() {}
};
void fCXX11MemberInitTest2() {
- // TODO: we'd expect the warning: {{2 uninitializeds field}}
CXX11MemberInitTest2(); // no-warning
}
+#endif // PEDANTIC
+
//===----------------------------------------------------------------------===//
// "Esoteric" primitive type tests.
//===----------------------------------------------------------------------===//
diff --git a/clang/test/Analysis/cxxnewexpr-callback.cpp b/clang/test/Analysis/cxxnewexpr-callback.cpp
index fe7a9fffad93..7df58cfa9ca2 100644
--- a/clang/test/Analysis/cxxnewexpr-callback.cpp
+++ b/clang/test/Analysis/cxxnewexpr-callback.cpp
@@ -9,7 +9,7 @@ void free(void *);
} // namespace std
void *operator new(size_t size) { return std::malloc(size); }
-void operator delete(void *ptr) { std::free(ptr); }
+void operator delete(void *ptr, size_t size) { std::free(ptr); }
struct S {
S() {}
@@ -49,7 +49,7 @@ void test() {
// CHECK-NEXT: PostCall (operator delete)
}
-void operator delete(void *ptr) {
+void operator delete(void *ptr, size_t size) {
std::free(ptr);
// CHECK-NO-INLINE-NEXT: PreCall (std::free)
// CHECK-NO-INLINE-NEXT: PostCall (std::free)
diff --git a/clang/test/Analysis/lifetime-extended-regions.cpp b/clang/test/Analysis/lifetime-extended-regions.cpp
index 4e98bd4b0403..524f4e0c400d 100644
--- a/clang/test/Analysis/lifetime-extended-regions.cpp
+++ b/clang/test/Analysis/lifetime-extended-regions.cpp
@@ -120,11 +120,11 @@ void aggregateWithReferences() {
clang_analyzer_dump(viaReference); // expected-warning-re {{&lifetime_extended_object{RefAggregate, viaReference, S{{[0-9]+}}} }}
clang_analyzer_dump(viaReference.rx); // expected-warning-re {{&lifetime_extended_object{int, viaReference, S{{[0-9]+}}} }}
clang_analyzer_dump(viaReference.ry); // expected-warning-re {{&lifetime_extended_object{Composite, viaReference, S{{[0-9]+}}} }}
-
- // clang does not currently implement extending lifetime of object bound to reference members of aggregates,
- // that are created from default member initializer (see `warn_unsupported_lifetime_extension` from `-Wdangling`)
- RefAggregate defaultInitExtended{i}; // clang-bug does not extend `Composite`
- clang_analyzer_dump(defaultInitExtended.ry); // expected-warning {{Unknown }}
+
+  // The lifetime of the object bound to reference members of aggregates
+  // created from a default member initializer is now extended.
+ RefAggregate defaultInitExtended{i};
+ clang_analyzer_dump(defaultInitExtended.ry); // expected-warning-re {{&lifetime_extended_object{Composite, defaultInitExtended, S{{[0-9]+}}} }}
}
void lambda() {
diff --git a/clang/test/Analysis/putenv-stack-array.c b/clang/test/Analysis/putenv-stack-array.c
new file mode 100644
index 000000000000..fbbf93259ab8
--- /dev/null
+++ b/clang/test/Analysis/putenv-stack-array.c
@@ -0,0 +1,70 @@
+// RUN: %clang_analyze_cc1 \
+// RUN: -analyzer-checker=alpha.security.PutenvStackArray \
+// RUN: -verify %s
+
+#include "Inputs/system-header-simulator.h"
+void free(void *);
+void *malloc(size_t);
+int putenv(char *);
+int snprintf(char *, size_t, const char *, ...);
+
+int test_auto_var(const char *var) {
+ char env[1024];
+ (void)snprintf(env, sizeof(env), "TEST=%s", var);
+ return putenv(env); // expected-warning{{The 'putenv' function should not be called with arrays that have automatic storage}}
+}
+
+int test_static_var(const char *var) {
+ static char env[1024];
+ (void)snprintf(env, sizeof(env), "TEST=%s", var);
+ return putenv(env); // no-warning: static array is used
+}
+
+void test_heap_memory(const char *var) {
+ const char *env_format = "TEST=%s";
+ const size_t len = strlen(var) + strlen(env_format);
+ char *env = (char *)malloc(len);
+ if (env == NULL)
+ return;
+ if (putenv(env) != 0) // no-warning: env was dynamically allocated.
+ free(env);
+}
+
+typedef struct {
+ int A;
+ char Env[1024];
+} Mem;
+
+int test_auto_var_struct() {
+ Mem mem;
+ return putenv(mem.Env); // expected-warning{{The 'putenv' function should not be called with}}
+}
+
+int test_auto_var_subarray() {
+ char env[1024];
+ return putenv(env + 100); // expected-warning{{The 'putenv' function should not be called with}}
+}
+
+int test_constant() {
+ char *env = "TEST";
+ return putenv(env); // no-warning: data is not on the stack
+}
+
+extern char *ext_env;
+int test_extern() {
+ return putenv(ext_env); // no-warning: extern storage class.
+}
+
+void test_auto_var_reset() {
+ char env[] = "NAME=value";
+ putenv(env); // expected-warning{{The 'putenv' function should not be called with}}
+ // ... (do something)
+  // Even a case like this is likely a bug:
+  // It looks like a string passed to putenv should not be deallocated at
+  // all, because reading the environment variable returns a pointer into
+  // this string.
+ // In this case, if another (or the same) thread reads variable "NAME"
+ // at this point and does not copy the returned string, the data may
+ // become invalid.
+ putenv((char *)"NAME=anothervalue");
+}
diff --git a/clang/test/Analysis/setgid-setuid-order-notes.c b/clang/test/Analysis/setgid-setuid-order-notes.c
new file mode 100644
index 000000000000..03402413581c
--- /dev/null
+++ b/clang/test/Analysis/setgid-setuid-order-notes.c
@@ -0,0 +1,73 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,security.SetgidSetuidOrder -analyzer-output=text -verify %s
+
+typedef int uid_t;
+typedef int gid_t;
+
+int setuid(uid_t);
+int setgid(gid_t);
+
+uid_t getuid();
+gid_t getgid();
+
+
+
+void test_note_1() {
+ if (setuid(getuid()) == -1) // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (setuid(getuid()) == -1) // expected-note{{Call to 'setuid' found here that removes superuser privileges}} \
+ // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+void test_note_2() {
+ if (setuid(getuid()) == -1) // expected-note{{Call to 'setuid' found here that removes superuser privileges}} \
+ // expected-note 2 {{Assuming the condition is false}} \
+ // expected-note 2 {{Taking false branch}}
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (setuid(getuid()) == -1) // expected-note{{Call to 'setuid' found here that removes superuser privileges}} \
+ // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+int f_setuid() {
+ return setuid(getuid()); // expected-note{{Call to 'setuid' found here that removes superuser privileges}}
+}
+
+int f_setgid() {
+ return setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
+
+void test_note_3() {
+ if (f_setuid() == -1) // expected-note{{Assuming the condition is false}} \
+ // expected-note{{Calling 'f_setuid'}} \
+ // expected-note{{Returning from 'f_setuid'}} \
+ // expected-note{{Taking false branch}}
+ return;
+ if (f_setgid() == -1) // expected-note{{Calling 'f_setgid'}}
+ return;
+}
+
+void test_note_4() {
+ if (setuid(getuid()) == 0) { // expected-note{{Assuming the condition is true}} \
+ // expected-note{{Call to 'setuid' found here that removes superuser privileges}} \
+ // expected-note{{Taking true branch}}
+ if (setgid(getgid()) == 0) { // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}} \
+ // expected-note{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ }
+ }
+}
diff --git a/clang/test/Analysis/setgid-setuid-order.c b/clang/test/Analysis/setgid-setuid-order.c
new file mode 100644
index 000000000000..1c411aa6a27b
--- /dev/null
+++ b/clang/test/Analysis/setgid-setuid-order.c
@@ -0,0 +1,257 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,security.SetgidSetuidOrder -verify %s
+
+typedef int uid_t;
+typedef int gid_t;
+
+int setuid(uid_t);
+int setgid(gid_t);
+int seteuid(uid_t);
+int setegid(gid_t);
+int setreuid(uid_t, uid_t);
+int setregid(gid_t, gid_t);
+int setresuid(uid_t, uid_t, uid_t);
+int setresgid(gid_t, gid_t, gid_t);
+
+uid_t getuid();
+gid_t getgid();
+
+
+
+void correct_order() {
+ // A correct revocation sequence starts here.
+ if (setgid(getgid()) == -1)
+ return;
+ if (setuid(getuid()) == -1)
+ return;
+ // No warning for the following setgid statement.
+ // The previous setgid and setuid calls are a correct privilege revocation
+ // sequence. The checker does not care about the following statements (except
+ // if a wrong setuid-setgid sequence follows again).
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void incorrect_after_correct() {
+ if (setgid(getgid()) == -1)
+ return;
+ if (setuid(getuid()) == -1)
+ return;
+ // Incorrect sequence starts here.
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+void incorrect_order() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void warn_at_second_time() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+uid_t f_uid();
+gid_t f_gid();
+
+void setuid_other() {
+ if (setuid(f_uid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setgid_other() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setgid(f_gid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setuid_other_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setuid(f_uid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setgid_with_getuid() {
+ if (setuid(getuid()) == -1)
+ return;
+ // add a clang-tidy check for this case?
+ if (setgid(getuid()) == -1)
+ return;
+}
+
+void setuid_with_getgid() {
+ // add a clang-tidy check for this case?
+ if (setuid(getgid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+int f_setuid() {
+ return setuid(getuid());
+}
+
+int f_setgid() {
+ return setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
+
+void function_calls() {
+ if (f_setuid() == -1)
+ return;
+ if (f_setgid() == -1)
+ return;
+}
+
+void seteuid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (seteuid(getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setegid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setegid(getgid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setreuid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setreuid(getuid(), getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setregid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setregid(getgid(), getgid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setresuid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setresuid(getuid(), getuid(), getuid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void setresgid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ if (setresgid(getgid(), getgid(), getgid()) == -1)
+ return;
+ if (setgid(getgid()) == -1)
+ return;
+}
+
+void getgid_getuid_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ (void)getgid();
+ (void)getuid();
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+void stored_getgid_getuid() {
+ // possible future improvement: detect this case
+ uid_t u = getuid();
+ gid_t g = getgid();
+ if (setuid(u) == -1)
+ return;
+ if (setgid(g) == -1) // no warning
+ return;
+}
+
+void f_extern();
+
+void other_unknown_function_between() {
+ if (setuid(getuid()) == -1)
+ return;
+ f_extern();
+ if (setgid(getgid()) == -1) // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+}
+
+void setuid_error_case() {
+ if (setuid(getuid()) == -1) {
+ // No warning if we know that the first setuid call has failed.
+ (void)setgid(getgid());
+ return;
+ }
+ (void)setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
+
+void setuid_success_case() {
+ if (setuid(getuid()) == 0) {
+ if (setgid(getgid()) == 0) { // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ }
+ }
+}
+
+void incorrect_order_compare_zero() {
+ if (setuid(getuid()) != 0)
+ return;
+ (void)setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
+
+void setuid_error_case_compare_zero() {
+ if (setuid(getuid()) != 0) {
+ // No warning if we know that the first setuid call has failed.
+ (void)setgid(getgid());
+ return;
+ }
+}
+
+void incorrect_order_compare_other() {
+ if (setuid(getuid()) == -2) {
+ // This is a case for improvement:
+ // The checker does not recognize that this is an invalid error check,
+ // but this is really another type of bug not related to this checker.
+ (void)setgid(getgid()); // warning should appear here
+ return;
+ }
+ if (setgid(getgid()) == -2) { // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+ return;
+ }
+}
+
+const int FAIL = -1;
+
+void incorrect_order_compare_var() {
+ if (setuid(getuid()) == FAIL)
+ return;
+ (void)setgid(getgid()); // expected-warning{{A 'setgid(getgid())' call following a 'setuid(getuid())' call is likely to fail}}
+}
diff --git a/clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp b/clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp
index 9e3210c6650f..706549f56c52 100644
--- a/clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp
+++ b/clang/test/CXX/basic/basic.stc/basic.stc.dynamic/basic.stc.dynamic.deallocation/p2.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1z -fsized-deallocation -fexceptions -verify %s
+// RUN: %clang_cc1 -std=c++1z -fexceptions -verify %s
using size_t = decltype(sizeof(0));
diff --git a/clang/test/CXX/drs/cwg16xx.cpp b/clang/test/CXX/drs/cwg16xx.cpp
index cf6b45ceabf2..82ef871939d2 100644
--- a/clang/test/CXX/drs/cwg16xx.cpp
+++ b/clang/test/CXX/drs/cwg16xx.cpp
@@ -483,8 +483,6 @@ namespace cwg1696 { // cwg1696: 7
const A &a = A(); // #cwg1696-D1-a
};
D1 d1 = {}; // #cwg1696-d1
- // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}}
- // since-cxx14-note@#cwg1696-D1-a {{initializing field 'a' with default member initializer}}
struct D2 {
const A &a = A(); // #cwg1696-D2-a
diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 35615076a628..b71a81b62f81 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -206,19 +206,28 @@ namespace cwg1814 { // cwg1814: yes
#endif
}
-namespace cwg1815 { // cwg1815: no
+namespace cwg1815 { // cwg1815: 19
#if __cplusplus >= 201402L
- // FIXME: needs codegen test
- struct A { int &&r = 0; }; // #cwg1815-A
+ struct A { int &&r = 0; };
A a = {};
- // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} FIXME
- // since-cxx14-note@#cwg1815-A {{initializing field 'r' with default member initializer}}
struct B { int &&r = 0; }; // #cwg1815-B
// since-cxx14-error@-1 {{reference member 'r' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}}
// since-cxx14-note@#cwg1815-B {{initializing field 'r' with default member initializer}}
// since-cxx14-note@#cwg1815-b {{in implicit default constructor for 'cwg1815::B' first required here}}
B b; // #cwg1815-b
+
+#if __cplusplus >= 201703L
+ struct C { const int &r = 0; };
+ constexpr C c = {}; // OK, since cwg1815
+ static_assert(c.r == 0);
+
+ constexpr int f() {
+ A a = {}; // OK, since cwg1815
+ return a.r;
+ }
+ static_assert(f() == 0);
+#endif
#endif
}
diff --git a/clang/test/CXX/drs/cwg28xx.cpp b/clang/test/CXX/drs/cwg28xx.cpp
index 696cd1b9c84e..8469a065ccaa 100644
--- a/clang/test/CXX/drs/cwg28xx.cpp
+++ b/clang/test/CXX/drs/cwg28xx.cpp
@@ -109,3 +109,74 @@ struct A {
#endif
} // namespace cwg2858
+
+namespace cwg2881 { // cwg2881: 19 tentatively ready 2024-04-19
+
+#if __cplusplus >= 202302L
+
+template <typename T> struct A : T {};
+template <typename T> struct B : T {};
+template <typename T> struct C : virtual T { C(T t) : T(t) {} };
+template <typename T> struct D : virtual T { D(T t) : T(t) {} };
+
+template <typename Ts>
+struct O1 : A<Ts>, B<Ts> {
+ using A<Ts>::operator();
+ using B<Ts>::operator();
+};
+
+template <typename Ts> struct O2 : protected Ts { // expected-note {{declared protected here}}
+ using Ts::operator();
+ O2(Ts ts) : Ts(ts) {}
+};
+
+template <typename Ts> struct O3 : private Ts { // expected-note {{declared private here}}
+ using Ts::operator();
+ O3(Ts ts) : Ts(ts) {}
+};
+
+// Not ambiguous because of virtual inheritance.
+template <typename Ts>
+struct O4 : C<Ts>, D<Ts> {
+ using C<Ts>::operator();
+ using D<Ts>::operator();
+ O4(Ts t) : Ts(t), C<Ts>(t), D<Ts>(t) {}
+};
+
+// This still has a public path to the lambda, and it's also not
+// ambiguous because of virtual inheritance.
+template <typename Ts>
+struct O5 : private C<Ts>, D<Ts> {
+ using C<Ts>::operator();
+ using D<Ts>::operator();
+ O5(Ts t) : Ts(t), C<Ts>(t), D<Ts>(t) {}
+};
+
+// This is only invalid if we call T's call operator.
+template <typename T, typename U>
+struct O6 : private T, U { // expected-note {{declared private here}}
+ using T::operator();
+ using U::operator();
+ O6(T t, U u) : T(t), U(u) {}
+};
+
+void f() {
+ int x;
+ auto L1 = [=](this auto&& self) { (void) &x; };
+ auto L2 = [&](this auto&& self) { (void) &x; };
+ O1<decltype(L1)>{L1, L1}(); // expected-error {{inaccessible due to ambiguity}}
+ O1<decltype(L2)>{L2, L2}(); // expected-error {{inaccessible due to ambiguity}}
+ O2{L1}(); // expected-error {{must derive publicly from the lambda}}
+ O3{L1}(); // expected-error {{must derive publicly from the lambda}}
+ O4{L1}();
+ O5{L1}();
+ O6 o{L1, L2};
+ o.decltype(L1)::operator()(); // expected-error {{must derive publicly from the lambda}}
+ o.decltype(L1)::operator()(); // No error here because we've already diagnosed this method.
+ o.decltype(L2)::operator()();
+}
+
+#endif
+
+} // namespace cwg2881
+
diff --git a/clang/test/CXX/drs/cwg292.cpp b/clang/test/CXX/drs/cwg292.cpp
index b05d3b92d627..a7bcbe6f5051 100644
--- a/clang/test/CXX/drs/cwg292.cpp
+++ b/clang/test/CXX/drs/cwg292.cpp
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
-// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK
+// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,CXX98-11
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,CXX98-11
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
+// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
+// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
+// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -disable-llvm-passes -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX14
namespace cwg292 { // cwg292: 2.9
@@ -23,7 +23,8 @@ void f() {
// CHECK: invoke {{.*}} i32 @cwg292::g()()
// CHECK-NEXT: to {{.*}} unwind label %lpad
// CHECK-LABEL: lpad:
-// CHECK: call void @operator delete(void*)(ptr {{.*}} %[[CALL]])
+// CXX98-11: call void @operator delete(void*)(ptr {{.*}} %[[CALL]])
+// SINCE-CXX14: call void @operator delete(void*, unsigned long)(ptr {{.*}} %[[CALL]], i64 noundef 1)
// CHECK-LABEL: eh.resume:
// CHECK-LABEL: }
diff --git a/clang/test/CXX/expr/expr.unary/expr.new/p14.cpp b/clang/test/CXX/expr/expr.unary/expr.new/p14.cpp
index 6537cdcfeafa..d0b24c8fe47b 100644
--- a/clang/test/CXX/expr/expr.unary/expr.new/p14.cpp
+++ b/clang/test/CXX/expr/expr.unary/expr.new/p14.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1z -fsized-deallocation -fexceptions %s -verify
+// RUN: %clang_cc1 -std=c++1z -fexceptions %s -verify
using size_t = decltype(sizeof(0));
namespace std { enum class align_val_t : size_t {}; }
diff --git a/clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp b/clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp
index afd8ef05302f..19f90801df31 100644
--- a/clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp
+++ b/clang/test/CXX/expr/expr.unary/expr.sizeof/p5-0x.cpp
@@ -33,6 +33,6 @@ template<int Value> struct count_ints_2 {
template<typename ...Types> // expected-note{{parameter pack 'Types' declared here}}
struct count_types_2 {
static const unsigned value = sizeof... Type; // expected-error{{missing parentheses around the size of parameter pack 'Type'}} \
- // expected-error{{Type' does not refer to the name of a parameter pack; did you mean 'Types'?}}
+ // expected-error{{'Type' does not refer to the name of a parameter pack; did you mean 'Types'?}}
};
diff --git a/clang/test/CXX/special/class.temporary/p6.cpp b/clang/test/CXX/special/class.temporary/p6.cpp
index 5554363cc69a..a6d2adfd1fd2 100644
--- a/clang/test/CXX/special/class.temporary/p6.cpp
+++ b/clang/test/CXX/special/class.temporary/p6.cpp
@@ -269,6 +269,40 @@ void init_capture_init_list() {
// CHECK: }
}
+void check_dr1815() { // dr1815: yes
+#if __cplusplus >= 201402L
+
+ struct A {
+ int &&r = 0;
+ ~A() {}
+ };
+
+ struct B {
+ A &&a = A{};
+ ~B() {}
+ };
+ B a = {};
+
+ // CHECK: call {{.*}}block_scope_begin_function
+ extern void block_scope_begin_function();
+ extern void block_scope_end_function();
+ block_scope_begin_function();
+ {
+ // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev
+ // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev
+ B b = {};
+ }
+ // CHECK: call {{.*}}block_scope_end_function
+ block_scope_end_function();
+
+ // CHECK: call {{.*}}some_other_function
+ extern void some_other_function();
+ some_other_function();
+ // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev
+ // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev
+#endif
+}
+
namespace P2718R0 {
namespace basic {
template <typename E> using T2 = std::list<E>;
diff --git a/clang/test/ClangScanDeps/response-file-clang-cl.c b/clang/test/ClangScanDeps/response-file-clang-cl.c
new file mode 100644
index 000000000000..b543231f4bb1
--- /dev/null
+++ b/clang/test/ClangScanDeps/response-file-clang-cl.c
@@ -0,0 +1,56 @@
+// Check that the scanner can adjust arguments by reading .rsp files in advance.
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// First run the tests with a .cdb
+// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json
+// RUN: sed -e "s|DIR|%/t|g" %t/args_nested.template > %t/args_nested.rsp
+
+// RUN: cp %t/args_compilation.rsp %t/args.rsp
+// RUN: clang-scan-deps --compilation-database %t/cdb.json > %t/deps.json
+// RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
+
+// RUN: cp %t/args_preprocess.rsp %t/args.rsp
+// RUN: clang-scan-deps --compilation-database %t/cdb.json > %t/deps.json
+// RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
+
+
+// Now run the tests again with an in-place compilation database
+// RUN: cd %t
+
+// RUN: cp args_compilation.rsp args.rsp
+// RUN: clang-scan-deps -o deps.json -- %clang_cl @args.rsp
+// RUN: cat deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
+
+// RUN: cp args_preprocess.rsp args.rsp
+// RUN: clang-scan-deps -o deps.json -- %clang_cl @args.rsp
+// RUN: cat deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s
+
+// Here we ensure that we got a qualified .obj with its full path, since that's what we're passing with /Fo
+// CHECK: [[PREFIX]]/tu.obj:
+
+//--- cdb.json.template
+[{
+ "file": "DIR/tu.cpp",
+ "directory": "DIR",
+ "command": "clang-cl @DIR/args.rsp"
+}]
+
+//--- args_compilation.rsp
+@args_nested.rsp
+/c
+
+//--- args_preprocess.rsp
+@args_nested.rsp
+/E
+
+//--- args_nested.template
+/I include
+tu.cpp
+/FoDIR/tu.obj
+
+//--- include/header.h
+
+//--- tu.cpp
+#include "header.h"
diff --git a/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c b/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c
index 76c9c0ebed2b..c678e9a9882f 100644
--- a/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c
+++ b/clang/test/CodeGen/SystemZ/sync-builtins-i128-8Al.c
@@ -7,21 +7,21 @@
__int128 Ptr __attribute__((aligned(8)));
__int128 f1() {
-// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+// CHECK: warning: __sync builtin operation must have natural alignment (consider using __atomic)
return __sync_fetch_and_add(&Ptr, 1);
}
__int128 f2() {
-// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+// CHECK: warning: __sync builtin operation must have natural alignment (consider using __atomic)
return __sync_sub_and_fetch(&Ptr, 1);
}
__int128 f3() {
-// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+// CHECK: warning: __sync builtin operation must have natural alignment (consider using __atomic)
return __sync_val_compare_and_swap(&Ptr, 0, 1);
}
void f4() {
-// CHECK: warning: __sync builtin operation MUST have natural alignment (consider using __atomic). [-Wsync-alignment]
+// CHECK: warning: __sync builtin operation must have natural alignment (consider using __atomic)
__sync_lock_release(&Ptr);
}
diff --git a/clang/test/CodeGen/X86/avx512er-builtins.c b/clang/test/CodeGen/X86/avx512er-builtins.c
deleted file mode 100644
index 11ec6aabec1e..000000000000
--- a/clang/test/CodeGen/X86/avx512er-builtins.c
+++ /dev/null
@@ -1,347 +0,0 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -target-feature +avx512er -emit-llvm -o - -Wall | FileCheck %s
-
-
-#include <immintrin.h>
-
-__m512d test_mm512_rsqrt28_round_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_rsqrt28_round_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_rsqrt28_round_pd(a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_mask_rsqrt28_round_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_rsqrt28_round_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_mask_rsqrt28_round_pd(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_maskz_rsqrt28_round_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_rsqrt28_round_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_maskz_rsqrt28_round_pd(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_rsqrt28_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_rsqrt28_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_rsqrt28_pd(a);
-}
-
-__m512d test_mm512_mask_rsqrt28_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_rsqrt28_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_mask_rsqrt28_pd(s, m, a);
-}
-
-__m512d test_mm512_maskz_rsqrt28_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_rsqrt28_pd
- // CHECK: @llvm.x86.avx512.rsqrt28.pd
- return _mm512_maskz_rsqrt28_pd(m, a);
-}
-
-__m512 test_mm512_rsqrt28_round_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_rsqrt28_round_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_mask_rsqrt28_round_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_rsqrt28_round_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_mask_rsqrt28_round_ps(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_maskz_rsqrt28_round_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_rsqrt28_round_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_maskz_rsqrt28_round_ps(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_rsqrt28_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_rsqrt28_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_rsqrt28_ps(a);
-}
-
-__m512 test_mm512_mask_rsqrt28_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_rsqrt28_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_mask_rsqrt28_ps(s, m, a);
-}
-
-__m512 test_mm512_maskz_rsqrt28_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_rsqrt28_ps
- // CHECK: @llvm.x86.avx512.rsqrt28.ps
- return _mm512_maskz_rsqrt28_ps(m, a);
-}
-
-__m128 test_mm_rsqrt28_round_ss(__m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_rsqrt28_round_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_rsqrt28_round_ss(a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_mask_rsqrt28_round_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_mask_rsqrt28_round_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_mask_rsqrt28_round_ss(s, m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_maskz_rsqrt28_round_ss(__mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_maskz_rsqrt28_round_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_maskz_rsqrt28_round_ss(m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_rsqrt28_ss(__m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_rsqrt28_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_rsqrt28_ss(a, b);
-}
-
-__m128 test_mm_mask_rsqrt28_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_mask_rsqrt28_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_mask_rsqrt28_ss(s, m, a, b);
-}
-
-__m128 test_mm_maskz_rsqrt28_ss(__mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_maskz_rsqrt28_ss
- // CHECK: @llvm.x86.avx512.rsqrt28.ss
- return _mm_maskz_rsqrt28_ss(m, a, b);
-}
-
-__m128d test_mm_rsqrt28_round_sd(__m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_rsqrt28_round_sd
- // CHECK: @llvm.x86.avx512.rsqrt28.sd
- return _mm_rsqrt28_round_sd(a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_mask_rsqrt28_round_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_mask_rsqrt28_round_sd
- // CHECK: @llvm.x86.avx512.rsqrt28.sd
- return _mm_mask_rsqrt28_round_sd(s, m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_maskz_rsqrt28_round_sd(__mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_maskz_rsqrt28_round_sd
- // CHECK: @llvm.x86.avx512.rsqrt28.sd
- return _mm_maskz_rsqrt28_round_sd(m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_rcp28_round_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_rcp28_round_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_rcp28_round_pd(a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_mask_rcp28_round_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_rcp28_round_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_mask_rcp28_round_pd(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_maskz_rcp28_round_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_rcp28_round_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_maskz_rcp28_round_pd(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_rcp28_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_rcp28_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_rcp28_pd(a);
-}
-
-__m512d test_mm512_mask_rcp28_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_rcp28_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_mask_rcp28_pd(s, m, a);
-}
-
-__m512d test_mm512_maskz_rcp28_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_rcp28_pd
- // CHECK: @llvm.x86.avx512.rcp28.pd
- return _mm512_maskz_rcp28_pd(m, a);
-}
-
-__m512 test_mm512_rcp28_round_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_rcp28_round_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_rcp28_round_ps(a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_mask_rcp28_round_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_rcp28_round_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_mask_rcp28_round_ps(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_maskz_rcp28_round_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_rcp28_round_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_maskz_rcp28_round_ps(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_rcp28_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_rcp28_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_rcp28_ps(a);
-}
-
-__m512 test_mm512_mask_rcp28_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_rcp28_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_mask_rcp28_ps(s, m, a);
-}
-
-__m512 test_mm512_maskz_rcp28_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_rcp28_ps
- // CHECK: @llvm.x86.avx512.rcp28.ps
- return _mm512_maskz_rcp28_ps(m, a);
-}
-
-__m128 test_mm_rcp28_round_ss(__m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_rcp28_round_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_rcp28_round_ss(a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_mask_rcp28_round_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_mask_rcp28_round_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_mask_rcp28_round_ss(s, m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_maskz_rcp28_round_ss(__mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_maskz_rcp28_round_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_maskz_rcp28_round_ss(m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128 test_mm_rcp28_ss(__m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_rcp28_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_rcp28_ss(a, b);
-}
-
-__m128 test_mm_mask_rcp28_ss(__m128 s, __mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_mask_rcp28_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_mask_rcp28_ss(s, m, a, b);
-}
-
-__m128 test_mm_maskz_rcp28_ss(__mmask16 m, __m128 a, __m128 b) {
- // CHECK-LABEL: @test_mm_maskz_rcp28_ss
- // CHECK: @llvm.x86.avx512.rcp28.ss
- return _mm_maskz_rcp28_ss(m, a, b);
-}
-
-__m128d test_mm_rcp28_round_sd(__m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_rcp28_round_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_rcp28_round_sd(a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_mask_rcp28_round_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_mask_rcp28_round_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_mask_rcp28_round_sd(s, m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_maskz_rcp28_round_sd(__mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_maskz_rcp28_round_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_maskz_rcp28_round_sd(m, a, b, _MM_FROUND_NO_EXC);
-}
-
-__m128d test_mm_rcp28_sd(__m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_rcp28_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_rcp28_sd(a, b);
-}
-
-__m128d test_mm_mask_rcp28_sd(__m128d s, __mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_mask_rcp28_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_mask_rcp28_sd(s, m, a, b);
-}
-
-__m128d test_mm_maskz_rcp28_sd(__mmask8 m, __m128d a, __m128d b) {
- // CHECK-LABEL: @test_mm_maskz_rcp28_sd
- // CHECK: @llvm.x86.avx512.rcp28.sd
- return _mm_maskz_rcp28_sd(m, a, b);
-}
-
-__m512d test_mm512_exp2a23_round_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_exp2a23_round_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_exp2a23_round_pd(a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_mask_exp2a23_round_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_exp2a23_round_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_mask_exp2a23_round_pd(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_maskz_exp2a23_round_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_exp2a23_round_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_maskz_exp2a23_round_pd(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512d test_mm512_exp2a23_pd(__m512d a) {
- // CHECK-LABEL: @test_mm512_exp2a23_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_exp2a23_pd(a);
-}
-
-__m512d test_mm512_mask_exp2a23_pd(__m512d s, __mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_mask_exp2a23_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_mask_exp2a23_pd(s, m, a);
-}
-
-__m512d test_mm512_maskz_exp2a23_pd(__mmask8 m, __m512d a) {
- // CHECK-LABEL: @test_mm512_maskz_exp2a23_pd
- // CHECK: @llvm.x86.avx512.exp2.pd
- return _mm512_maskz_exp2a23_pd(m, a);
-}
-
-__m512 test_mm512_exp2a23_round_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_exp2a23_round_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_exp2a23_round_ps(a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_mask_exp2a23_round_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_exp2a23_round_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_mask_exp2a23_round_ps(s, m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_maskz_exp2a23_round_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_exp2a23_round_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_maskz_exp2a23_round_ps(m, a, _MM_FROUND_NO_EXC);
-}
-
-__m512 test_mm512_exp2a23_ps(__m512 a) {
- // CHECK-LABEL: @test_mm512_exp2a23_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_exp2a23_ps(a);
-}
-
-__m512 test_mm512_mask_exp2a23_ps(__m512 s, __mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_mask_exp2a23_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_mask_exp2a23_ps(s, m, a);
-}
-
-__m512 test_mm512_maskz_exp2a23_ps(__mmask16 m, __m512 a) {
- // CHECK-LABEL: @test_mm512_maskz_exp2a23_ps
- // CHECK: @llvm.x86.avx512.exp2.ps
- return _mm512_maskz_exp2a23_ps(m, a);
-}
-
diff --git a/clang/test/CodeGen/X86/avx512pf-builtins.c b/clang/test/CodeGen/X86/avx512pf-builtins.c
deleted file mode 100644
index 3a117ed6a946..000000000000
--- a/clang/test/CodeGen/X86/avx512pf-builtins.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512pf -emit-llvm -o - -Wall | FileCheck %s
-
-
-#include <immintrin.h>
-
-void test_mm512_mask_prefetch_i32gather_pd(__m256i index, __mmask8 mask, void const *addr) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_pd
- // CHECK: @llvm.x86.avx512.gatherpf.dpd
- return _mm512_mask_prefetch_i32gather_pd(index, mask, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i32gather_pd(__m256i index, void const *addr) {
- // CHECK-LABEL: @test_mm512_prefetch_i32gather_pd
- // CHECK: @llvm.x86.avx512.gatherpf.dpd
- return _mm512_prefetch_i32gather_pd(index, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_mask_prefetch_i32gather_ps(__m512i index, __mmask16 mask, void const *addr) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_ps
- // CHECK: @llvm.x86.avx512.gatherpf.dps
- return _mm512_mask_prefetch_i32gather_ps(index, mask, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i32gather_ps(__m512i index, void const *addr) {
- // CHECK-LABEL: @test_mm512_prefetch_i32gather_ps
- // CHECK: @llvm.x86.avx512.gatherpf.dps
- return _mm512_prefetch_i32gather_ps(index, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_mask_prefetch_i64gather_pd(__m512i index, __mmask8 mask, void const *addr) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_pd
- // CHECK: @llvm.x86.avx512.gatherpf.qpd
- return _mm512_mask_prefetch_i64gather_pd(index, mask, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i64gather_pd(__m512i index, void const *addr) {
- // CHECK-LABEL: @test_mm512_prefetch_i64gather_pd
- // CHECK: @llvm.x86.avx512.gatherpf.qpd
- return _mm512_prefetch_i64gather_pd(index, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_mask_prefetch_i64gather_ps(__m512i index, __mmask8 mask, void const *addr) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_ps
- // CHECK: @llvm.x86.avx512.gatherpf.qps
- return _mm512_mask_prefetch_i64gather_ps(index, mask, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i64gather_ps(__m512i index, void const *addr) {
- // CHECK-LABEL: @test_mm512_prefetch_i64gather_ps
- // CHECK: @llvm.x86.avx512.gatherpf.qps
- return _mm512_prefetch_i64gather_ps(index, addr, 2, _MM_HINT_T0);
-}
-
-void test_mm512_prefetch_i32scatter_pd(void *addr, __m256i index) {
- // CHECK-LABEL: @test_mm512_prefetch_i32scatter_pd
- // CHECK: @llvm.x86.avx512.scatterpf.dpd.512
- return _mm512_prefetch_i32scatter_pd(addr, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_mask_prefetch_i32scatter_pd(void *addr, __mmask8 mask, __m256i index) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_pd
- // CHECK: @llvm.x86.avx512.scatterpf.dpd.512
- return _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_prefetch_i32scatter_ps(void *addr, __m512i index) {
- // CHECK-LABEL: @test_mm512_prefetch_i32scatter_ps
- // CHECK: @llvm.x86.avx512.scatterpf.dps.512
- return _mm512_prefetch_i32scatter_ps(addr, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_mask_prefetch_i32scatter_ps(void *addr, __mmask16 mask, __m512i index) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_ps
- // CHECK: @llvm.x86.avx512.scatterpf.dps.512
- return _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_prefetch_i64scatter_pd(void *addr, __m512i index) {
- // CHECK-LABEL: @test_mm512_prefetch_i64scatter_pd
- // CHECK: @llvm.x86.avx512.scatterpf.qpd.512
- return _mm512_prefetch_i64scatter_pd(addr, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_mask_prefetch_i64scatter_pd(void *addr, __mmask16 mask, __m512i index) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_pd
- // CHECK: @llvm.x86.avx512.scatterpf.qpd.512
- return _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_prefetch_i64scatter_ps(void *addr, __m512i index) {
- // CHECK-LABEL: @test_mm512_prefetch_i64scatter_ps
- // CHECK: @llvm.x86.avx512.scatterpf.qps.512
- return _mm512_prefetch_i64scatter_ps(addr, index, 1, _MM_HINT_T1);
-}
-
-void test_mm512_mask_prefetch_i64scatter_ps(void *addr, __mmask16 mask, __m512i index) {
- // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_ps
- // CHECK: @llvm.x86.avx512.scatterpf.qps.512
- return _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, 1, _MM_HINT_T1);
-}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
index c442d2c0c475..d894e98451b4 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
@@ -2,12 +2,14 @@
// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve2p1 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-#include <arm_sme.h>
+#include <arm_sve.h>
#if defined __ARM_FEATURE_SME
#define MODE_ATTR __arm_streaming
@@ -16,7 +18,7 @@
#endif
#ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.§
+// A simple used,unused... macro, long enough to represent any SVE builtin.
#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
#else
#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
index bf2cd23e4080..41208bfb1f43 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret-bfloat.c
@@ -4,6 +4,10 @@
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x4 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x2 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
// RUN: %clang_cc1 -fclang-abi-compat=latest -DTUPLE=x3 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
@@ -18,9 +22,16 @@
// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -target-feature +bf16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
#include <arm_sve.h>
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR
+#endif
+
#ifdef TUPLE
#define TYPE_1(base,tuple) base ## tuple ## _t
#define TYPE_0(base,tuple) TYPE_1(base,tuple)
@@ -81,7 +92,7 @@
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) {
+TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_s8, _bf16)(op);
}
@@ -125,7 +136,7 @@ TYPE(svint8) test_svreinterpret_s8_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) {
+TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_s16, _bf16)(op);
}
@@ -169,7 +180,7 @@ TYPE(svint16) test_svreinterpret_s16_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) {
+TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_s32, _bf16)(op);
}
// CHECK-LABEL: @test_svreinterpret_s64_bf16(
@@ -212,7 +223,7 @@ TYPE(svint32) test_svreinterpret_s32_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) {
+TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_s64, _bf16)(op);
}
@@ -256,7 +267,7 @@ TYPE(svint64) test_svreinterpret_s64_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) {
+TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_u8, _bf16)(op);
}
@@ -300,7 +311,7 @@ TYPE(svuint8) test_svreinterpret_u8_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) {
+TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_u16, _bf16)(op);
}
@@ -344,7 +355,7 @@ TYPE(svuint16) test_svreinterpret_u16_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) {
+TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_u32, _bf16)(op);
}
@@ -388,7 +399,7 @@ TYPE(svuint32) test_svreinterpret_u32_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) {
+TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_u64, _bf16)(op);
}
@@ -432,7 +443,7 @@ TYPE(svuint64) test_svreinterpret_u64_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _s8)(op);
}
@@ -476,7 +487,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s8(TYPE(svint8) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _s16)(op);
}
@@ -520,7 +531,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s16(TYPE(svint16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _s32)(op);
}
@@ -564,7 +575,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s32(TYPE(svint32) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _s64)(op);
}
@@ -608,7 +619,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_s64(TYPE(svint64) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _u8)(op);
}
@@ -652,7 +663,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u8(TYPE(svuint8) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _u16)(op);
}
@@ -696,7 +707,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u16(TYPE(svuint16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _u32)(op);
}
@@ -740,7 +751,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u32(TYPE(svuint32) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _u64)(op);
}
@@ -776,7 +787,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_u64(TYPE(svuint64) op) {
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[OP:%.*]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _bf16)(op);
}
@@ -820,7 +831,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _f16)(op);
}
@@ -864,7 +875,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f16(TYPE(svfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _f32)(op);
}
@@ -908,7 +919,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f32(TYPE(svfloat32) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x bfloat>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x bfloat> [[TMP0]]
//
-TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) {
+TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_bf16, _f64)(op);
}
@@ -952,7 +963,7 @@ TYPE(svbfloat16) test_svreinterpret_bf16_f64(TYPE(svfloat64) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) {
+TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_f32, _bf16)(op);
}
@@ -996,7 +1007,7 @@ TYPE(svfloat32) test_svreinterpret_f32_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) {
+TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_f16, _bf16)(op);
}
@@ -1040,6 +1051,6 @@ TYPE(svfloat16) test_svreinterpret_f16_bf16(TYPE(svbfloat16) op) {
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x bfloat> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) {
+TYPE(svfloat64) test_svreinterpret_f64_bf16(TYPE(svbfloat16) op) MODE_ATTR {
return SVE_ACLE_FUNC(svreinterpret_f64, _bf16)(op);
}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
index 3d9d5c3ce45a..e61bbf3e03d7 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret.c
@@ -4,6 +4,10 @@
// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE2
+// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE3
+// RUN: %clang_cc1 -DTUPLE=x4 -triple aarch64 -target-feature +sme -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=TUPLE4
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -DTUPLE=x2 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE2
// RUN: %clang_cc1 -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
@@ -17,9 +21,16 @@
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x3 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE3
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -DTUPLE=x4 -triple aarch64 -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-TUPLE4
// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
#include <arm_sve.h>
+#if defined __ARM_FEATURE_SME
+#define MODE_ATTR __arm_streaming
+#else
+#define MODE_ATTR
+#endif
+
#ifdef TUPLE
#define TYPE_1(base,tuple) base ## tuple ## _t
#define TYPE_0(base,tuple) TYPE_1(base,tuple)
@@ -72,7 +83,7 @@
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[OP:%.*]]
//
-TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op)
+TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_s8)(op);
}
@@ -117,7 +128,7 @@ TYPE(svint8) test_svreinterpret_s8_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op)
+TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_s16)(op);
}
@@ -162,7 +173,7 @@ TYPE(svint8) test_svreinterpret_s8_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op)
+TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_s32)(op);
}
@@ -207,7 +218,7 @@ TYPE(svint8) test_svreinterpret_s8_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op)
+TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_s64)(op);
}
@@ -244,7 +255,7 @@ TYPE(svint8) test_svreinterpret_s8_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[OP:%.*]]
//
-TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op)
+TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_u8)(op);
}
@@ -289,7 +300,7 @@ TYPE(svint8) test_svreinterpret_s8_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op)
+TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_u16)(op);
}
@@ -335,7 +346,7 @@ TYPE(svint8) test_svreinterpret_s8_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op)
+TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_u32)(op);
}
@@ -381,7 +392,7 @@ TYPE(svint8) test_svreinterpret_s8_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op)
+TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_u64)(op);
}
@@ -426,7 +437,7 @@ TYPE(svint8) test_svreinterpret_s8_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op)
+TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_f16)(op);
}
@@ -471,7 +482,7 @@ TYPE(svint8) test_svreinterpret_s8_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op)
+TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_f32)(op);
}
@@ -516,7 +527,7 @@ TYPE(svint8) test_svreinterpret_s8_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op)
+TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s8,_f64)(op);
}
@@ -561,7 +572,7 @@ TYPE(svint8) test_svreinterpret_s8_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op)
+TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_s8)(op);
}
@@ -598,7 +609,7 @@ TYPE(svint16) test_svreinterpret_s16_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[OP:%.*]]
//
-TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op)
+TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_s16)(op);
}
@@ -643,7 +654,7 @@ TYPE(svint16) test_svreinterpret_s16_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op)
+TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_s32)(op);
}
@@ -688,7 +699,7 @@ TYPE(svint16) test_svreinterpret_s16_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op)
+TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_s64)(op);
}
@@ -733,7 +744,7 @@ TYPE(svint16) test_svreinterpret_s16_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op)
+TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_u8)(op);
}
@@ -770,7 +781,7 @@ TYPE(svint16) test_svreinterpret_s16_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[OP:%.*]]
//
-TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op)
+TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_u16)(op);
}
@@ -815,7 +826,7 @@ TYPE(svint16) test_svreinterpret_s16_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op)
+TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_u32)(op);
}
@@ -860,7 +871,7 @@ TYPE(svint16) test_svreinterpret_s16_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op)
+TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_u64)(op);
}
@@ -905,7 +916,7 @@ TYPE(svint16) test_svreinterpret_s16_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op)
+TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_f16)(op);
}
@@ -950,7 +961,7 @@ TYPE(svint16) test_svreinterpret_s16_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op)
+TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_f32)(op);
}
@@ -995,7 +1006,7 @@ TYPE(svint16) test_svreinterpret_s16_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op)
+TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s16,_f64)(op);
}
@@ -1040,7 +1051,7 @@ TYPE(svint16) test_svreinterpret_s16_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op)
+TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_s8)(op);
}
@@ -1085,7 +1096,7 @@ TYPE(svint32) test_svreinterpret_s32_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op)
+TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_s16)(op);
}
@@ -1122,7 +1133,7 @@ TYPE(svint32) test_svreinterpret_s32_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[OP:%.*]]
//
-TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op)
+TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_s32)(op);
}
@@ -1167,7 +1178,7 @@ TYPE(svint32) test_svreinterpret_s32_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op)
+TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_s64)(op);
}
@@ -1212,7 +1223,7 @@ TYPE(svint32) test_svreinterpret_s32_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op)
+TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_u8)(op);
}
@@ -1257,7 +1268,7 @@ TYPE(svint32) test_svreinterpret_s32_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op)
+TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_u16)(op);
}
@@ -1294,7 +1305,7 @@ TYPE(svint32) test_svreinterpret_s32_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[OP:%.*]]
//
-TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op)
+TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_u32)(op);
}
@@ -1339,7 +1350,7 @@ TYPE(svint32) test_svreinterpret_s32_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op)
+TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_u64)(op);
}
@@ -1384,7 +1395,7 @@ TYPE(svint32) test_svreinterpret_s32_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op)
+TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_f16)(op);
}
@@ -1429,7 +1440,7 @@ TYPE(svint32) test_svreinterpret_s32_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op)
+TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_f32)(op);
}
@@ -1475,7 +1486,7 @@ TYPE(svint32) test_svreinterpret_s32_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op)
+TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s32,_f64)(op);
}
@@ -1520,7 +1531,7 @@ TYPE(svint32) test_svreinterpret_s32_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op)
+TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_s8)(op);
}
@@ -1565,7 +1576,7 @@ TYPE(svint64) test_svreinterpret_s64_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op)
+TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_s16)(op);
}
@@ -1610,7 +1621,7 @@ TYPE(svint64) test_svreinterpret_s64_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op)
+TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_s32)(op);
}
@@ -1647,7 +1658,7 @@ TYPE(svint64) test_svreinterpret_s64_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[OP:%.*]]
//
-TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op)
+TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_s64)(op);
}
@@ -1692,7 +1703,7 @@ TYPE(svint64) test_svreinterpret_s64_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op)
+TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_u8)(op);
}
@@ -1737,7 +1748,7 @@ TYPE(svint64) test_svreinterpret_s64_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op)
+TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_u16)(op);
}
@@ -1782,7 +1793,7 @@ TYPE(svint64) test_svreinterpret_s64_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op)
+TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_u32)(op);
}
@@ -1819,7 +1830,7 @@ TYPE(svint64) test_svreinterpret_s64_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[OP:%.*]]
//
-TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op)
+TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_u64)(op);
}
@@ -1864,7 +1875,7 @@ TYPE(svint64) test_svreinterpret_s64_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op)
+TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_f16)(op);
}
@@ -1909,7 +1920,7 @@ TYPE(svint64) test_svreinterpret_s64_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op)
+TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_f32)(op);
}
@@ -1954,7 +1965,7 @@ TYPE(svint64) test_svreinterpret_s64_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op)
+TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_s64,_f64)(op);
}
@@ -1991,7 +2002,7 @@ TYPE(svint64) test_svreinterpret_s64_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[OP:%.*]]
//
-TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op)
+TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_s8)(op);
}
@@ -2036,7 +2047,7 @@ TYPE(svuint8) test_svreinterpret_u8_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op)
+TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_s16)(op);
}
@@ -2081,7 +2092,7 @@ TYPE(svuint8) test_svreinterpret_u8_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op)
+TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_s32)(op);
}
@@ -2126,7 +2137,7 @@ TYPE(svuint8) test_svreinterpret_u8_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op)
+TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_s64)(op);
}
@@ -2163,7 +2174,7 @@ TYPE(svuint8) test_svreinterpret_u8_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[OP:%.*]]
//
-TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op)
+TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_u8)(op);
}
@@ -2208,7 +2219,7 @@ TYPE(svuint8) test_svreinterpret_u8_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op)
+TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_u16)(op);
}
@@ -2253,7 +2264,7 @@ TYPE(svuint8) test_svreinterpret_u8_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op)
+TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_u32)(op);
}
@@ -2298,7 +2309,7 @@ TYPE(svuint8) test_svreinterpret_u8_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op)
+TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_u64)(op);
}
@@ -2343,7 +2354,7 @@ TYPE(svuint8) test_svreinterpret_u8_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op)
+TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_f16)(op);
}
@@ -2388,7 +2399,7 @@ TYPE(svuint8) test_svreinterpret_u8_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op)
+TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_f32)(op);
}
@@ -2433,7 +2444,7 @@ TYPE(svuint8) test_svreinterpret_u8_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 64 x i8>
// CPP-TUPLE4-NEXT: ret <vscale x 64 x i8> [[TMP0]]
//
-TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op)
+TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u8,_f64)(op);
}
@@ -2478,7 +2489,7 @@ TYPE(svuint8) test_svreinterpret_u8_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op)
+TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_s8)(op);
}
@@ -2515,7 +2526,7 @@ TYPE(svuint16) test_svreinterpret_u16_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[OP:%.*]]
//
-TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op)
+TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_s16)(op);
}
@@ -2560,7 +2571,7 @@ TYPE(svuint16) test_svreinterpret_u16_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op)
+TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_s32)(op);
}
@@ -2605,7 +2616,7 @@ TYPE(svuint16) test_svreinterpret_u16_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op)
+TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_s64)(op);
}
@@ -2650,7 +2661,7 @@ TYPE(svuint16) test_svreinterpret_u16_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op)
+TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_u8)(op);
}
@@ -2687,7 +2698,7 @@ TYPE(svuint16) test_svreinterpret_u16_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[OP:%.*]]
//
-TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op)
+TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_u16)(op);
}
@@ -2732,7 +2743,7 @@ TYPE(svuint16) test_svreinterpret_u16_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op)
+TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_u32)(op);
}
@@ -2777,7 +2788,7 @@ TYPE(svuint16) test_svreinterpret_u16_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op)
+TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_u64)(op);
}
@@ -2822,7 +2833,7 @@ TYPE(svuint16) test_svreinterpret_u16_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op)
+TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_f16)(op);
}
@@ -2867,7 +2878,7 @@ TYPE(svuint16) test_svreinterpret_u16_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op)
+TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_f32)(op);
}
@@ -2912,7 +2923,7 @@ TYPE(svuint16) test_svreinterpret_u16_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x i16>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x i16> [[TMP0]]
//
-TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op)
+TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u16,_f64)(op);
}
@@ -2957,7 +2968,7 @@ TYPE(svuint16) test_svreinterpret_u16_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op)
+TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_s8)(op);
}
@@ -3002,7 +3013,7 @@ TYPE(svuint32) test_svreinterpret_u32_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op)
+TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_s16)(op);
}
@@ -3039,7 +3050,7 @@ TYPE(svuint32) test_svreinterpret_u32_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[OP:%.*]]
//
-TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op)
+TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_s32)(op);
}
@@ -3084,7 +3095,7 @@ TYPE(svuint32) test_svreinterpret_u32_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op)
+TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_s64)(op);
}
@@ -3129,7 +3140,7 @@ TYPE(svuint32) test_svreinterpret_u32_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op)
+TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_u8)(op);
}
@@ -3174,7 +3185,7 @@ TYPE(svuint32) test_svreinterpret_u32_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op)
+TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_u16)(op);
}
@@ -3211,7 +3222,7 @@ TYPE(svuint32) test_svreinterpret_u32_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[OP:%.*]]
//
-TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op)
+TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_u32)(op);
}
@@ -3256,7 +3267,7 @@ TYPE(svuint32) test_svreinterpret_u32_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op)
+TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_u64)(op);
}
@@ -3301,7 +3312,7 @@ TYPE(svuint32) test_svreinterpret_u32_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op)
+TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_f16)(op);
}
@@ -3346,7 +3357,7 @@ TYPE(svuint32) test_svreinterpret_u32_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op)
+TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_f32)(op);
}
@@ -3391,7 +3402,7 @@ TYPE(svuint32) test_svreinterpret_u32_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x i32>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op)
+TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u32,_f64)(op);
}
@@ -3436,7 +3447,7 @@ TYPE(svuint32) test_svreinterpret_u32_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op)
+TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_s8)(op);
}
@@ -3481,7 +3492,7 @@ TYPE(svuint64) test_svreinterpret_u64_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op)
+TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_s16)(op);
}
@@ -3526,7 +3537,7 @@ TYPE(svuint64) test_svreinterpret_u64_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op)
+TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_s32)(op);
}
@@ -3563,7 +3574,7 @@ TYPE(svuint64) test_svreinterpret_u64_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[OP:%.*]]
//
-TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op)
+TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_s64)(op);
}
@@ -3608,7 +3619,7 @@ TYPE(svuint64) test_svreinterpret_u64_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op)
+TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_u8)(op);
}
@@ -3653,7 +3664,7 @@ TYPE(svuint64) test_svreinterpret_u64_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op)
+TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_u16)(op);
}
@@ -3698,7 +3709,7 @@ TYPE(svuint64) test_svreinterpret_u64_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op)
+TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_u32)(op);
}
@@ -3735,7 +3746,7 @@ TYPE(svuint64) test_svreinterpret_u64_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[OP:%.*]]
//
-TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op)
+TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_u64)(op);
}
@@ -3780,7 +3791,7 @@ TYPE(svuint64) test_svreinterpret_u64_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op)
+TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_f16)(op);
}
@@ -3825,7 +3836,7 @@ TYPE(svuint64) test_svreinterpret_u64_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op)
+TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_f32)(op);
}
@@ -3870,7 +3881,7 @@ TYPE(svuint64) test_svreinterpret_u64_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 8 x i64>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x i64> [[TMP0]]
//
-TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op)
+TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_u64,_f64)(op);
}
@@ -3915,7 +3926,7 @@ TYPE(svuint64) test_svreinterpret_u64_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op)
+TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_s8)(op);
}
@@ -3960,7 +3971,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op)
+TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_s16)(op);
}
@@ -4005,7 +4016,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op)
+TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_s32)(op);
}
@@ -4050,7 +4061,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op)
+TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_s64)(op);
}
@@ -4095,7 +4106,7 @@ TYPE(svfloat16) test_svreinterpret_f16_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op)
+TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_u8)(op);
}
@@ -4140,7 +4151,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op)
+TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_u16)(op);
}
@@ -4185,7 +4196,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op)
+TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_u32)(op);
}
@@ -4230,7 +4241,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op)
+TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_u64)(op);
}
@@ -4267,7 +4278,7 @@ TYPE(svfloat16) test_svreinterpret_f16_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[OP:%.*]]
//
-TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op)
+TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_f16)(op);
}
@@ -4312,7 +4323,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op)
+TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_f32)(op);
}
@@ -4357,7 +4368,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 32 x half>
// CPP-TUPLE4-NEXT: ret <vscale x 32 x half> [[TMP0]]
//
-TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op)
+TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f16,_f64)(op);
}
@@ -4402,7 +4413,7 @@ TYPE(svfloat16) test_svreinterpret_f16_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op)
+TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_s8)(op);
}
@@ -4447,7 +4458,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op)
+TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_s16)(op);
}
@@ -4492,7 +4503,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op)
+TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_s32)(op);
}
@@ -4537,7 +4548,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op)
+TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_s64)(op);
}
@@ -4582,7 +4593,7 @@ TYPE(svfloat32) test_svreinterpret_f32_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op)
+TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_u8)(op);
}
@@ -4627,7 +4638,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op)
+TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_u16)(op);
}
@@ -4672,7 +4683,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op)
+TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_u32)(op);
}
@@ -4717,7 +4728,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op)
+TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_u64)(op);
}
@@ -4762,7 +4773,7 @@ TYPE(svfloat32) test_svreinterpret_f32_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op)
+TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_f16)(op);
}
@@ -4799,7 +4810,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[OP:%.*]]
//
-TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op)
+TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_f32)(op);
}
@@ -4844,7 +4855,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x double> [[OP:%.*]] to <vscale x 16 x float>
// CPP-TUPLE4-NEXT: ret <vscale x 16 x float> [[TMP0]]
//
-TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op)
+TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f32,_f64)(op);
}
@@ -4889,7 +4900,7 @@ TYPE(svfloat32) test_svreinterpret_f32_f64(TYPE(svfloat64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op)
+TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_s8)(op);
}
@@ -4934,7 +4945,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s8(TYPE(svint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op)
+TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_s16)(op);
}
@@ -4979,7 +4990,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s16(TYPE(svint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op)
+TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_s32)(op);
}
@@ -5024,7 +5035,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s32(TYPE(svint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op)
+TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_s64)(op);
}
@@ -5069,7 +5080,7 @@ TYPE(svfloat64) test_svreinterpret_f64_s64(TYPE(svint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 64 x i8> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op)
+TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_u8)(op);
}
@@ -5114,7 +5125,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u8(TYPE(svuint8) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x i16> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op)
+TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_u16)(op);
}
@@ -5159,7 +5170,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u16(TYPE(svuint16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i32> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op)
+TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_u32)(op);
}
@@ -5204,7 +5215,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u32(TYPE(svuint32) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 8 x i64> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op)
+TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_u64)(op);
}
@@ -5249,7 +5260,7 @@ TYPE(svfloat64) test_svreinterpret_f64_u64(TYPE(svuint64) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 32 x half> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op)
+TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_f16)(op);
}
@@ -5294,7 +5305,7 @@ TYPE(svfloat64) test_svreinterpret_f64_f16(TYPE(svfloat16) op)
// CPP-TUPLE4-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x float> [[OP:%.*]] to <vscale x 8 x double>
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[TMP0]]
//
-TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op)
+TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_f32)(op);
}
@@ -5331,7 +5342,7 @@ TYPE(svfloat64) test_svreinterpret_f64_f32(TYPE(svfloat32) op)
// CPP-TUPLE4-NEXT: entry:
// CPP-TUPLE4-NEXT: ret <vscale x 8 x double> [[OP:%.*]]
//
-TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op)
+TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op) MODE_ATTR
{
return SVE_ACLE_FUNC(svreinterpret_f64,_f64)(op);
}
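
The hunks above append MODE_ATTR to every svreinterpret test function. The macro's definition lives in a part of the test file that is not shown in this excerpt; the sketch below is purely hypothetical and only illustrates the usual pattern such streaming-mode tests follow (the attribute name and the guard are assumptions, not taken from this patch).

/* Hypothetical illustration only -- the real definition is outside this excerpt. */
#if defined(__ARM_FEATURE_SME)
#define MODE_ATTR __arm_streaming_compatible   /* assumed streaming-mode attribute */
#else
#define MODE_ATTR                              /* expands to nothing for plain SVE runs */
#endif

/* With such a definition, a declaration like
     TYPE(svfloat64) test_svreinterpret_f64_f64(TYPE(svfloat64) op) MODE_ATTR
   gains the streaming attribute only when SME is being tested. */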
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
deleted file mode 100644
index f27875836193..000000000000
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_reinterpret_from_streaming_mode.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -S -O1 -Werror -Wall -o /dev/null %s
-
-// Note: We need to run this test with '-O1' because oddly enough the svreinterpret is always inlined at -O0.
-
-#include <arm_sve.h>
-
-#ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
-#else
-#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
-#endif
-
-// Test that svreinterpret is inlined (because it should be streaming-compatible)
-__attribute__((target("sme")))
-// CHECK-LABEL: @test_svreinterpret_s16_s8_from_streaming_mode(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
-// CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-// CPP-CHECK-LABEL: @_Z45test_svreinterpret_s16_s8_from_streaming_modeu10__SVInt8_t(
-// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i8> [[OP:%.*]] to <vscale x 8 x i16>
-// CPP-CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]]
-//
-svint16_t test_svreinterpret_s16_s8_from_streaming_mode(svint8_t op) __arm_streaming {
- return SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)(op);
-}
-
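
For reference, the SVE_ACLE_FUNC macro defined in the removed file above merely selects between the overloaded and the fully suffixed intrinsic name; the expansion shown here is derived directly from the two #define lines in that file.

// Without SVE_OVERLOADED_FORMS: A1##A2##A3##A4
//   SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)  ->  svreinterpret_s16_s8
// With SVE_OVERLOADED_FORMS:    A1##A3
//   SVE_ACLE_FUNC(svreinterpret_s16,_s8,,)  ->  svreinterpret_s16
#include <arm_sve.h>

svint16_t reinterpret_example(svint8_t op) {
  return svreinterpret_s16_s8(op);   // the non-overloaded spelling the macro produces
}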
diff --git a/clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c b/clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c
index fb60c6d100ce..52a05d010de9 100644
--- a/clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c
+++ b/clang/test/CodeGen/aarch64-sve-vector-subscript-ops.c
@@ -88,3 +88,25 @@ float subscript_float32(svfloat32_t a, size_t b) {
double subscript_float64(svfloat64_t a, size_t b) {
return a[b];
}
+
+// CHECK-LABEL: @subscript_write_float32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECINS:%.*]] = insertelement <vscale x 4 x float> [[A:%.*]], float 1.000000e+00, i64 [[B:%.*]]
+// CHECK-NEXT: ret <vscale x 4 x float> [[VECINS]]
+//
+svfloat32_t subscript_write_float32(svfloat32_t a, size_t b) {
+ a[b] = 1.0f;
+ return a;
+}
+
+// CHECK-LABEL: @subscript_read_write_float32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <vscale x 4 x float> [[A:%.*]], i64 [[B:%.*]]
+// CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], 1.000000e+00
+// CHECK-NEXT: [[VECINS:%.*]] = insertelement <vscale x 4 x float> [[A]], float [[ADD]], i64 [[B]]
+// CHECK-NEXT: ret <vscale x 4 x float> [[VECINS]]
+//
+svfloat32_t subscript_read_write_float32(svfloat32_t a, size_t b) {
+ a[b] += 1.0f;
+ return a;
+}
diff --git a/clang/test/CodeGen/assume_attr.c b/clang/test/CodeGen/assume_attr.c
deleted file mode 100644
index 338a625188af..000000000000
--- a/clang/test/CodeGen/assume_attr.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// RUN: %clang_cc1 -emit-llvm -triple i386-linux-gnu %s -o - | FileCheck %s
-// RUN: %clang_cc1 -x c -emit-pch -o %t %s
-// RUN: %clang_cc1 -include-pch %t %s -emit-llvm -o - | FileCheck %s
-
-// TODO: for "foo" and "bar", "after" is not added as it appears "after" the first use or definition respectively. There might be a way to allow that.
-
-// CHECK: define{{.*}} void @bar() #0
-// CHECK: define{{.*}} void @baz() #1
-// CHECK: declare{{.*}} void @foo() #2
-// CHECK: attributes #0
-// CHECK-SAME: "llvm.assume"="bar:before1,bar:before2,bar:before3,bar:def1,bar:def2"
-// CHECK: attributes #1
-// CHECK-SAME: "llvm.assume"="baz:before1,baz:before2,baz:before3,baz:def1,baz:def2,baz:after"
-// CHECK: attributes #2
-// CHECK-SAME: "llvm.assume"="foo:before1,foo:before2,foo:before3"
-
-#ifndef HEADER
-#define HEADER
-
-/// foo: declarations only
-
-__attribute__((assume("foo:before1"))) void foo(void);
-
-__attribute__((assume("foo:before2")))
-__attribute__((assume("foo:before3"))) void
-foo(void);
-
-/// baz: static function declarations and a definition
-
-__attribute__((assume("baz:before1"))) static void baz(void);
-
-__attribute__((assume("baz:before2")))
-__attribute__((assume("baz:before3"))) static void
-baz(void);
-
-// Definition
-__attribute__((assume("baz:def1,baz:def2"))) static void baz(void) { foo(); }
-
-__attribute__((assume("baz:after"))) static void baz(void);
-
-/// bar: external function declarations and a definition
-
-__attribute__((assume("bar:before1"))) void bar(void);
-
-__attribute__((assume("bar:before2")))
-__attribute__((assume("bar:before3"))) void
-bar(void);
-
-// Definition
-__attribute__((assume("bar:def1,bar:def2"))) void bar(void) { baz(); }
-
-__attribute__((assume("bar:after"))) void bar(void);
-
-/// back to foo
-
-__attribute__((assume("foo:after"))) void foo(void);
-
-#endif
diff --git a/clang/test/CodeGen/attr-cpuspecific.c b/clang/test/CodeGen/attr-cpuspecific.c
index 2c3e6931800c..628892d5809b 100644
--- a/clang/test/CodeGen/attr-cpuspecific.c
+++ b/clang/test/CodeGen/attr-cpuspecific.c
@@ -75,8 +75,8 @@ void TwoVersions(void);
// LINUX: define weak_odr ptr @TwoVersions.resolver()
// LINUX: call void @__cpu_indicator_init
// LINUX: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
-// LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 59754495
-// LINUX: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 59754495
+// LINUX: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847
+// LINUX: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 9422847
// LINUX: ret ptr @TwoVersions.Z
// LINUX: ret ptr @TwoVersions.S
// LINUX: call void @llvm.trap
@@ -85,8 +85,8 @@ void TwoVersions(void);
// WINDOWS: define weak_odr dso_local void @TwoVersions() comdat
// WINDOWS: call void @__cpu_indicator_init()
// WINDOWS: %[[FEAT_INIT:.+]] = load i32, ptr getelementptr inbounds ({ i32, i32, i32, [1 x i32] }, ptr @__cpu_model, i32 0, i32 3, i32 0), align 4
-// WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 59754495
-// WINDOWS: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 59754495
+// WINDOWS: %[[FEAT_JOIN:.+]] = and i32 %[[FEAT_INIT]], 9422847
+// WINDOWS: %[[FEAT_CHECK:.+]] = icmp eq i32 %[[FEAT_JOIN]], 9422847
// WINDOWS: call void @TwoVersions.Z()
// WINDOWS-NEXT: ret void
// WINDOWS: call void @TwoVersions.S()
@@ -354,7 +354,7 @@ void OrderDispatchUsageSpecific(void) {}
// CHECK: attributes #[[S]] = {{.*}}"target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-SAME: "tune-cpu"="ivybridge"
-// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
+// CHECK: attributes #[[K]] = {{.*}}"target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cmov,+crc32,+cx16,+cx8,+evex512,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-SAME: "tune-cpu"="knl"
// CHECK: attributes #[[O]] = {{.*}}"target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+movbe,+sahf,+sse,+sse2,+sse3,+ssse3,+x87"
// CHECK-SAME: "tune-cpu"="atom"
diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c
index 304398678216..3c2b511157f9 100644
--- a/clang/test/CodeGen/attr-target-x86.c
+++ b/clang/test/CodeGen/attr-target-x86.c
@@ -59,9 +59,9 @@ void __attribute__((target("avx10.1-512"))) avx10_1_512(void) {}
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87" "tune-cpu"="i686"
// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-NOT: tune-cpu
-// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
+// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-aes,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sha512,-sm3,-sm4,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+crc32,+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
-// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
+// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-avx,-avx10.1-256,-avx10.1-512,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512fp16,-avx512ifma,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxifma,-avxneconvert,-avxvnni,-avxvnniint16,-avxvnniint8,-f16c,-fma,-fma4,-sha512,-sm3,-sm4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cmov,+crc32,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-avx10.1-256,-avx10.1-512,-vaes"
// CHECK-NOT: tune-cpu
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cmov,+cx8,+x87,-3dnow,-3dnowa,-mmx"
diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c
index bcb15969de1c..93a6ab06081c 100644
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@@ -11,6 +11,7 @@ typedef unsigned char u8x16 __attribute((vector_size(16)));
typedef unsigned short u16x8 __attribute((vector_size(16)));
typedef unsigned int u32x4 __attribute((vector_size(16)));
typedef unsigned long long u64x2 __attribute((vector_size(16)));
+typedef __fp16 f16x8 __attribute((vector_size(16)));
typedef float f32x4 __attribute((vector_size(16)));
typedef double f64x2 __attribute((vector_size(16)));
@@ -813,6 +814,17 @@ void store_f16_f32(float val, __fp16 *addr) {
// WEBASSEMBLY-NEXT: ret
}
+f16x8 splat_f16x8(float a) {
+ // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.splat.f16x8(float %a)
+ // WEBASSEMBLY-NEXT: ret <8 x half> %0
+ return __builtin_wasm_splat_f16x8(a);
+}
+
+float extract_lane_f16x8(f16x8 a, int i) {
+ // WEBASSEMBLY: %0 = tail call float @llvm.wasm.extract.lane.f16x8(<8 x half> %a, i32 %i)
+ // WEBASSEMBLY-NEXT: ret float %0
+ return __builtin_wasm_extract_lane_f16x8(a, i);
+}
__externref_t externref_null() {
return __builtin_wasm_ref_null_extern();
// WEBASSEMBLY: tail call ptr addrspace(10) @llvm.wasm.ref.null.extern()
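
The two builtins added above can be composed; the snippet below reuses only the f16x8 typedef and builtin names that appear in this hunk.

typedef __fp16 f16x8 __attribute((vector_size(16)));

// Splat a scalar into all eight half-precision lanes, then read one lane back.
float f16x8_round_trip(float a, int i) {
  f16x8 v = __builtin_wasm_splat_f16x8(a);          // llvm.wasm.splat.f16x8
  return __builtin_wasm_extract_lane_f16x8(v, i);   // llvm.wasm.extract.lane.f16x8
}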
diff --git a/clang/test/CodeGen/darwin-target-variant.c b/clang/test/CodeGen/darwin-target-variant.c
index 36caaaec1bdb..9f4b36a790db 100644
--- a/clang/test/CodeGen/darwin-target-variant.c
+++ b/clang/test/CodeGen/darwin-target-variant.c
@@ -2,5 +2,5 @@
// CHECK: !llvm.module.flags = !{!0, !1, !2
// CHECK: !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 1]}
-// CHECK: !1 = !{i32 4, !"darwin.target_variant.triple", !"x86_64-apple-ios14-macabi"}
+// CHECK: !1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios14-macabi"}
// CHECK: !2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [i32 14, i32 1]}
diff --git a/clang/test/CodeGen/fat-lto-objects.c b/clang/test/CodeGen/fat-lto-objects.c
index b50567c024fc..36a73684e7bf 100644
--- a/clang/test/CodeGen/fat-lto-objects.c
+++ b/clang/test/CodeGen/fat-lto-objects.c
@@ -62,7 +62,7 @@
// ELF: .llvm.lto
-// ASM: .section .llvm.lto,"e",@progbits
+// ASM: .section .llvm.lto,"e",@llvm_lto
// ASM-NEXT: .Lllvm.embedded.object:
// ASM-NEXT: .asciz "BC
// ASM-NEXT: .size .Lllvm.embedded.object
diff --git a/clang/test/CodeGen/function-target-features.c b/clang/test/CodeGen/function-target-features.c
index 0d8bfc7e4e44..d6a73ff8224b 100644
--- a/clang/test/CodeGen/function-target-features.c
+++ b/clang/test/CodeGen/function-target-features.c
@@ -4,7 +4,7 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx | FileCheck %s -check-prefix=AVX-FEATURE
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx | FileCheck %s -check-prefix=AVX-NO-CPU
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx512f -target-feature +avx512er | FileCheck %s -check-prefix=TWO-AVX
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-feature +avx512f -target-feature +avx512bw | FileCheck %s -check-prefix=TWO-AVX
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7 | FileCheck %s -check-prefix=CORE-CPU
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7 -target-feature +avx | FileCheck %s -check-prefix=CORE-CPU-AND-FEATURES
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu x86-64 | FileCheck %s -check-prefix=X86-64-CPU
@@ -17,7 +17,7 @@ void foo(void) {}
// AVX-FEATURE: "target-features"{{.*}}+avx
// AVX-NO-CPU-NOT: target-cpu
-// TWO-AVX: "target-features"={{.*}}+avx512er{{.*}}+avx512f
+// TWO-AVX: "target-features"={{.*}}+avx512bw{{.*}}+avx512f
// CORE-CPU: "target-cpu"="corei7"
// CORE-CPU-AND-FEATURES: "target-cpu"="corei7" "target-features"={{.*}}+avx
// X86-64-CPU: "target-cpu"="x86-64"
diff --git a/clang/test/CodeGen/functions.c b/clang/test/CodeGen/functions.c
index 1bbaa80d653c..0cc999aa4916 100644
--- a/clang/test/CodeGen/functions.c
+++ b/clang/test/CodeGen/functions.c
@@ -61,3 +61,15 @@ static void test9_helper(void) {}
void test9(void) {
(void) test9_helper;
}
+
+// PR88917: don't crash
+int b();
+
+int main() {
+ return b(b);
+ // CHECK: call i32 @b(ptr noundef @b)
+}
+int b(int (*f)()){
+ return 0;
+}
+// CHECK-LABEL: define{{.*}} i32 @b(ptr noundef %f)
diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c
index b438e50848a4..2e16fd8b9fe4 100644
--- a/clang/test/CodeGen/target-builtin-noerror.c
+++ b/clang/test/CodeGen/target-builtin-noerror.c
@@ -68,8 +68,6 @@ void verifyfeaturestrings(void) {
(void)__builtin_cpu_supports("avx512bw");
(void)__builtin_cpu_supports("avx512dq");
(void)__builtin_cpu_supports("avx512cd");
- (void)__builtin_cpu_supports("avx512er");
- (void)__builtin_cpu_supports("avx512pf");
(void)__builtin_cpu_supports("avx512vbmi");
(void)__builtin_cpu_supports("avx512ifma");
(void)__builtin_cpu_supports("avx5124vnniw");
diff --git a/clang/test/CodeGenCXX/assume_attr.cpp b/clang/test/CodeGenCXX/assume_attr.cpp
index dbe76501377c..962dcc470f67 100644
--- a/clang/test/CodeGenCXX/assume_attr.cpp
+++ b/clang/test/CodeGenCXX/assume_attr.cpp
@@ -8,77 +8,77 @@
/// foo: declarations only
-__attribute__((assume("foo:before1"))) void foo();
+[[omp::assume("foo:before1")]] void foo();
-__attribute__((assume("foo:before2")))
-__attribute__((assume("foo:before3"))) void
+[[omp::assume("foo:before2")]]
+[[omp::assume("foo:before3")]] void
foo();
/// baz: static function declarations and a definition
-__attribute__((assume("baz:before1"))) static void baz();
+[[omp::assume("baz:before1")]] static void baz();
-__attribute__((assume("baz:before2")))
-__attribute__((assume("baz:before3"))) static void
+[[omp::assume("baz:before2")]]
+[[omp::assume("baz:before3")]] static void
baz();
// Definition
-__attribute__((assume("baz:def1,baz:def2"))) static void baz() { foo(); }
+[[omp::assume("baz:def1,baz:def2")]] static void baz() { foo(); }
-__attribute__((assume("baz:after"))) static void baz();
+[[omp::assume("baz:after")]] static void baz();
/// bar: external function declarations and a definition
-__attribute__((assume("bar:before1"))) void bar();
+[[omp::assume("bar:before1")]] void bar();
-__attribute__((assume("bar:before2")))
-__attribute__((assume("bar:before3"))) void
+[[omp::assume("bar:before2")]]
+[[omp::assume("bar:before3")]] void
bar();
// Definition
-__attribute__((assume("bar:def1,bar:def2"))) void bar() { baz(); }
+[[omp::assume("bar:def1,bar:def2")]] void bar() { baz(); }
-__attribute__((assume("bar:after"))) void bar();
+[[omp::assume("bar:after")]] void bar();
/// back to foo
-__attribute__((assume("foo:after"))) void foo();
+[[omp::assume("foo:after")]] void foo();
/// class tests
class C {
- __attribute__((assume("C:private_method"))) void private_method();
- __attribute__((assume("C:private_static"))) static void private_static();
+ [[omp::assume("C:private_method")]] void private_method();
+ [[omp::assume("C:private_static")]] static void private_static();
public:
- __attribute__((assume("C:public_method1"))) void public_method();
- __attribute__((assume("C:public_static1"))) static void public_static();
+ [[omp::assume("C:public_method1")]] void public_method();
+ [[omp::assume("C:public_static1")]] static void public_static();
};
-__attribute__((assume("C:public_method2"))) void C::public_method() {
+[[omp::assume("C:public_method2")]] void C::public_method() {
private_method();
}
-__attribute__((assume("C:public_static2"))) void C::public_static() {
+[[omp::assume("C:public_static2")]] void C::public_static() {
private_static();
}
/// template tests
template <typename T>
-__attribute__((assume("template_func<T>"))) void template_func() {}
+[[omp::assume("template_func<T>")]] void template_func() {}
template <>
-__attribute__((assume("template_func<float>"))) void template_func<float>() {}
+[[omp::assume("template_func<float>")]] void template_func<float>() {}
template <>
void template_func<int>() {}
template <typename T>
struct S {
- __attribute__((assume("S<T>::method"))) void method();
+ [[omp::assume("S<T>::method")]] void method();
};
template <>
-__attribute__((assume("S<float>::method"))) void S<float>::method() {}
+[[omp::assume("S<float>::method")]] void S<float>::method() {}
template <>
void S<int>::method() {}
diff --git a/clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp b/clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp
index 4e1565725152..55913aff9c19 100644
--- a/clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp
+++ b/clang/test/CodeGenCXX/cxx1y-sized-deallocation.cpp
@@ -1,12 +1,12 @@
// Check that delete exprs call the sized deallocation function if
-// -fsized-deallocation is passed in both C++11 and C++14.
+// -fsized-deallocation is passed in C++11 or std >= C++14.
// RUN: %clang_cc1 -std=c++11 -fsized-deallocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++14 -fsized-deallocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++14 %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// Check that we don't used sized deallocation without -fsized-deallocation and
-// C++14.
+// Check that we don't use sized deallocation with -fno-sized-deallocation or without C++14.
// RUN: %clang_cc1 -std=c++11 %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefix=CHECK-UNSIZED
-// RUN: %clang_cc1 -std=c++14 %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefix=CHECK-UNSIZED
+// RUN: %clang_cc1 -std=c++14 %s -emit-llvm -triple x86_64-linux-gnu -fno-sized-deallocation -o - \
+// RUN: | FileCheck %s --check-prefix=CHECK-UNSIZED
// CHECK-UNSIZED-NOT: _ZdlPvm
// CHECK-UNSIZED-NOT: _ZdaPvm
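
These RUN-line changes reflect that sized deallocation is now on by default from C++14 onward and is only turned off with -fno-sized-deallocation. The mangled name _ZdlPvm that CHECK now expects is the global sized operator delete; a short sketch of the standard signatures involved (not taken from this test):

#include <cstddef>

// _ZdlPvm and _ZdaPvm on Itanium x86_64:
void operator delete(void *p, std::size_t size) noexcept;
void operator delete[](void *p, std::size_t size) noexcept;

struct S { int x; };
void destroy(S *s) {
  delete s;   // with sized deallocation, lowers to operator delete(s, sizeof(S))
}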
diff --git a/clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp b/clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp
index ab2e4b3cdbbf..8823bc64a436 100644
--- a/clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp
+++ b/clang/test/CodeGenCXX/cxx1z-aligned-allocation.cpp
@@ -1,10 +1,10 @@
// Check that delete exprs call aligned (de)allocation functions if
// -faligned-allocation is passed in both C++11 and C++14.
// RUN: %clang_cc1 -std=c++11 -fexceptions -fsized-deallocation -faligned-allocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++14 -fexceptions -fsized-deallocation -faligned-allocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++1z -fexceptions -fsized-deallocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++14 -fexceptions -faligned-allocation %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
+// RUN: %clang_cc1 -std=c++1z -fexceptions %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s
-// RUN: %clang_cc1 -std=c++1z -fexceptions -fsized-deallocation %s -emit-llvm -triple x86_64-windows-msvc -o - | FileCheck %s --check-prefix=CHECK-MS
+// RUN: %clang_cc1 -std=c++1z -fexceptions %s -emit-llvm -triple x86_64-windows-msvc -o - | FileCheck %s --check-prefix=CHECK-MS
// Check that we don't use aligned (de)allocation without -faligned-allocation or C++1z.
// RUN: %clang_cc1 -std=c++14 -DUNALIGNED -fexceptions %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefix=CHECK-UNALIGNED
diff --git a/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp b/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp
index 20264b67353a..f6f4a2ff735c 100644
--- a/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp
+++ b/clang/test/CodeGenCXX/cxx2a-destroying-delete.cpp
@@ -108,10 +108,10 @@ struct J {
// CHECK-MSABI-LABEL: define {{.*}}@"?j@@
J *j() {
// CHECK-ITANIUM: invoke {{.*}}@_ZN1JC1Ev(
- // CHECK-ITANIUM: call {{.*}}@_ZdlPv(
+ // CHECK-ITANIUM: call {{.*}}@_ZdlPvm(
// CHECK-NOT: }
// CHECK-MSABI: invoke {{.*}}@"??0J@@Q{{AE|EAA}}@XZ"(
- // CHECK-MSABI: call {{.*}}@"??3@YAXP{{E?}}AX@Z"(
+ // CHECK-MSABI: call {{.*}}@"??3@YAXP{{E?}}AX{{I|_K}}@Z"(
return new J;
// CHECK: }
}
diff --git a/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp b/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp
index b755e80db35a..649fe2afbf4e 100644
--- a/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp
+++ b/clang/test/CodeGenCXX/cxx2b-deducing-this.cpp
@@ -182,3 +182,66 @@ auto dothing(int num)
fun();
}
}
+
+namespace GH87210 {
+template <typename... Ts>
+struct Overloaded : Ts... {
+ using Ts::operator()...;
+};
+
+template <typename... Ts>
+Overloaded(Ts...) -> Overloaded<Ts...>;
+
+// CHECK-LABEL: define dso_local void @_ZN7GH872101fEv()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[X:%.*]] = alloca i32
+// CHECK-NEXT: [[Over:%.*]] = alloca %"{{.*}}Overloaded"
+// CHECK: call noundef ptr @"_ZZN7GH872101fEvENH3$_0clINS_10OverloadedIJS0_EEEEEDaRT_"(ptr {{.*}} [[Over]])
+void f() {
+ int x;
+ Overloaded o {
+ // CHECK: define internal noundef ptr @"_ZZN7GH872101fEvENH3$_0clINS_10OverloadedIJS0_EEEEEDaRT_"(ptr {{.*}} [[Self:%.*]])
+ // CHECK-NEXT: entry:
+ // CHECK-NEXT: [[SelfAddr:%.*]] = alloca ptr
+ // CHECK-NEXT: store ptr [[Self]], ptr [[SelfAddr]]
+ // CHECK-NEXT: [[SelfPtr:%.*]] = load ptr, ptr [[SelfAddr]]
+ // CHECK-NEXT: [[XRef:%.*]] = getelementptr inbounds %{{.*}}, ptr [[SelfPtr]], i32 0, i32 0
+ // CHECK-NEXT: [[X:%.*]] = load ptr, ptr [[XRef]]
+ // CHECK-NEXT: ret ptr [[X]]
+ [&](this auto& self) {
+ return &x;
+ }
+ };
+ o();
+}
+
+void g() {
+ int x;
+ Overloaded o {
+ [=](this auto& self) {
+ return x;
+ }
+ };
+ o();
+}
+}
+
+namespace GH89541 {
+// Same as above; just check that this doesn't crash.
+int one = 1;
+auto factory(int& x = one) {
+ return [&](this auto self) {
+ x;
+ };
+};
+
+using Base = decltype(factory());
+struct Derived : Base {
+ Derived() : Base(factory()) {}
+};
+
+void f() {
+ Derived d;
+ d();
+}
+}
diff --git a/clang/test/CodeGenCXX/delete-two-arg.cpp b/clang/test/CodeGenCXX/delete-two-arg.cpp
index 552634f430a8..a0dcd03bc5a9 100644
--- a/clang/test/CodeGenCXX/delete-two-arg.cpp
+++ b/clang/test/CodeGenCXX/delete-two-arg.cpp
@@ -43,7 +43,9 @@ namespace test2 {
// CHECK-NEXT: br i1 [[T1]],
// CHECK: [[T3:%.*]] = getelementptr inbounds i8, ptr [[T0]], i32 -4
// CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T3]]
- // CHECK-NEXT: call void @_ZdaPv(ptr noundef [[T3]])
+ // CHECK-NEXT: [[T6:%.*]] = mul i32 4, [[T5]]
+ // CHECK-NEXT: [[T7:%.*]] = add i32 [[T6]], 4
+ // CHECK-NEXT: call void @_ZdaPvj(ptr noundef [[T3]], i32 noundef [[T7]])
// CHECK-NEXT: br label
::delete[] p;
}
diff --git a/clang/test/CodeGenCXX/delete.cpp b/clang/test/CodeGenCXX/delete.cpp
index 1a418f48b659..d5b0dc671291 100644
--- a/clang/test/CodeGenCXX/delete.cpp
+++ b/clang/test/CodeGenCXX/delete.cpp
@@ -16,7 +16,7 @@ void t3(S *s) {
// CHECK: icmp {{.*}} null
// CHECK: br i1
- // CHECK: call void @_ZdlPv
+ // CHECK: call void @_ZdlPvm
// Check the delete is inside the 'if !null' check unless we're optimizing
// for size. FIXME: We could omit the branch entirely in this case.
@@ -35,7 +35,7 @@ struct T {
void t4(T *t) {
// CHECK: call void @_ZN1TD1Ev
// CHECK-SIZE-NEXT: br
- // CHECK: call void @_ZdlPv
+ // CHECK: call void @_ZdlPvm
delete t;
}
@@ -93,14 +93,16 @@ namespace test1 {
// CHECK-NEXT: call void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[CUR]])
// CHECK-NEXT: [[ISDONE:%.*]] = icmp eq ptr [[CUR]], [[BEGIN]]
// CHECK-NEXT: br i1 [[ISDONE]]
- // CHECK: call void @_ZdaPv(ptr noundef [[ALLOC]])
+ // CHECK: [[MUL:%.*]] = mul i64 4, [[COUNT]]
+ // CHECK-NEXT: [[SIZE:%.*]] = add i64 [[MUL]], 8
+ // CHECK-NEXT: call void @_ZdaPvm(ptr noundef [[ALLOC]], i64 noundef [[SIZE]])
}
}
namespace test2 {
// CHECK-LABEL: define{{.*}} void @_ZN5test21fEPb
void f(bool *b) {
- // CHECK: call void @_ZdlPv(ptr
+ // CHECK: call void @_ZdlPvm(ptr{{.*}}i64
delete b;
// CHECK: call void @_ZdaPv(ptr
delete [] b;
@@ -137,7 +139,7 @@ namespace test4 {
// CHECK-NEXT: [[DTOR:%.*]] = load ptr, ptr [[T0]]
// CHECK-NEXT: call void [[DTOR]](ptr {{[^,]*}} [[OBJ:%.*]])
// Call the global operator delete.
- // CHECK-NEXT: call void @_ZdlPv(ptr noundef [[ALLOCATED]]) [[NUW:#[0-9]+]]
+ // CHECK-NEXT: call void @_ZdlPvm(ptr noundef [[ALLOCATED]], i64 noundef 8) [[NUW:#[0-9]+]]
::delete xp;
}
}
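
In the array case above, the size operand for ::operator delete[](void*, size_t) is recomputed from the element count stored in the allocation cookie: element size times count plus the cookie size (4*count + 8 in test1, 4*n + 4 in the 32-bit delete-two-arg case). A one-line restatement of the mul/add in the CHECK lines:

#include <cstddef>

// Size passed to the sized array delete, matching the mul/add in the CHECK lines.
std::size_t array_delete_size(std::size_t count, std::size_t elem_size, std::size_t cookie) {
  return elem_size * count + cookie;   // e.g. 4 * count + 8 on x86_64
}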
diff --git a/clang/test/CodeGenCXX/dllimport.cpp b/clang/test/CodeGenCXX/dllimport.cpp
index 6fec2f2982d4..484866b45389 100644
--- a/clang/test/CodeGenCXX/dllimport.cpp
+++ b/clang/test/CodeGenCXX/dllimport.cpp
@@ -205,7 +205,7 @@ USEVAR(VarTmpl<ExplicitSpec_Imported>)
// Functions
//===----------------------------------------------------------------------===//
-// GNU-DAG: declare dso_local void @_ZdlPv(ptr)
+// GNU-DAG: declare dso_local void @_ZdlPv{{j|y}}(ptr, i{{32|64}})
// Import function declaration.
// MSC-DAG: declare dllimport void @"?decl@@YAXXZ"()
@@ -358,7 +358,7 @@ __declspec(dllimport) void operator delete(void*);
__declspec(dllimport) inline int *ReferencingImportedNew() { return new int[2]; }
// MO1-DAG: define available_externally dllimport ptr @"?ReferencingImportedNew@@YAPAHXZ"
__declspec(dllimport) inline int *ReferencingImportedDelete() { delete (int*)nullptr; }
-// MO1-DAG: define available_externally dllimport ptr @"?ReferencingImportedDelete@@YAPAHXZ"
+// MO1-DAG: declare dllimport ptr @"?ReferencingImportedDelete@@YAPAHXZ"
USE(ReferencingImportedNew)
USE(ReferencingImportedDelete)
struct ClassWithDtor { ~ClassWithDtor() {} };
diff --git a/clang/test/CodeGenCXX/fmv-namespace.cpp b/clang/test/CodeGenCXX/fmv-namespace.cpp
new file mode 100644
index 000000000000..5bcd0da06eeb
--- /dev/null
+++ b/clang/test/CodeGenCXX/fmv-namespace.cpp
@@ -0,0 +1,93 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --version 5
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+namespace Name {
+int __attribute((target_version("default"))) foo() { return 0; }
+}
+
+namespace Name {
+int __attribute((target_version("sve"))) foo() { return 1; }
+}
+
+int bar() { return Name::foo(); }
+
+namespace OtherName {
+int __attribute((target_version("sve"))) foo() { return 2; }
+}
+
+int baz() { return OtherName::foo(); }
+
+//.
+// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
+// CHECK: @_ZN4Name3fooEv.ifunc = weak_odr alias i32 (), ptr @_ZN4Name3fooEv
+// CHECK: @_ZN9OtherName3fooEv.ifunc = weak_odr alias i32 (), ptr @_ZN9OtherName3fooEv
+// CHECK: @_ZN4Name3fooEv = weak_odr ifunc i32 (), ptr @_ZN4Name3fooEv.resolver
+// CHECK: @_ZN9OtherName3fooEv = weak_odr ifunc i32 (), ptr @_ZN9OtherName3fooEv.resolver
+//.
+// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv.default(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_ZN4Name3fooEv._Msve(
+// CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 1
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_Z3barv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN4Name3fooEv()
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK-LABEL: define weak_odr ptr @_ZN4Name3fooEv.resolver() comdat {
+// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK: [[RESOLVER_RETURN]]:
+// CHECK-NEXT: ret ptr @_ZN4Name3fooEv._Msve
+// CHECK: [[RESOLVER_ELSE]]:
+// CHECK-NEXT: ret ptr @_ZN4Name3fooEv.default
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_ZN9OtherName3fooEv._Msve(
+// CHECK-SAME: ) #[[ATTR1]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i32 2
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @_Z3bazv(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN9OtherName3fooEv()
+// CHECK-NEXT: ret i32 [[CALL]]
+//
+//
+// CHECK-LABEL: define weak_odr ptr @_ZN9OtherName3fooEv.resolver() comdat {
+// CHECK-NEXT: [[RESOLVER_ENTRY:.*:]]
+// CHECK-NEXT: call void @__init_cpu_features_resolver()
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824
+// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824
+// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]]
+// CHECK: [[RESOLVER_RETURN]]:
+// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv._Msve
+// CHECK: [[RESOLVER_ELSE]]:
+// CHECK-NEXT: ret ptr @_ZN9OtherName3fooEv.default
+//
+//.
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR1]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+//.
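
In both resolvers above, the constant 1073741824 is a single feature bit (1 << 30) in __aarch64_cpu_features; when it is set the resolver returns the _Msve body, otherwise the default. A quick sanity check of that constant:

// The resolver ANDs the runtime feature word with this mask and compares for equality,
// i.e. it tests exactly one bit before selecting the _Msve version.
static_assert(1073741824 == (1u << 30), "feature bit tested by the FMV resolvers above");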
diff --git a/clang/test/CodeGenCXX/new.cpp b/clang/test/CodeGenCXX/new.cpp
index e278d9acfe9e..af225529c494 100644
--- a/clang/test/CodeGenCXX/new.cpp
+++ b/clang/test/CodeGenCXX/new.cpp
@@ -15,7 +15,7 @@ void t1() {
}
// CHECK: declare noundef nonnull ptr @_Znwm(i64 noundef) [[ATTR_NOBUILTIN:#[^ ]*]]
-// CHECK: declare void @_ZdlPv(ptr noundef) [[ATTR_NOBUILTIN_NOUNWIND:#[^ ]*]]
+// CHECK: declare void @_ZdlPvm(ptr noundef, i64 noundef) [[ATTR_NOBUILTIN_NOUNWIND:#[^ ]*]]
// CHECK: declare noundef nonnull ptr @_Znam(i64 noundef) [[ATTR_NOBUILTIN]]
// CHECK: declare void @_ZdaPv(ptr noundef) [[ATTR_NOBUILTIN_NOUNWIND]]
@@ -192,7 +192,7 @@ void f() {
// CHECK: store i64 200
delete[] new (nothrow) Alloc[10][20];
// CHECK: call noalias noundef nonnull ptr @_Znwm
- // CHECK: call void @_ZdlPv(ptr
+ // CHECK: call void @_ZdlPvm(ptr noundef {{%.*}}, i64 noundef 1)
delete new bool;
// CHECK: ret void
}
@@ -317,7 +317,7 @@ namespace N3664 {
void f() {
// CHECK: call noalias noundef nonnull ptr @_Znwm(i64 noundef 4) [[ATTR_BUILTIN_NEW:#[^ ]*]]
int *p = new int; // expected-note {{allocated with 'new' here}}
- // CHECK: call void @_ZdlPv({{.*}}) [[ATTR_BUILTIN_DELETE:#[^ ]*]]
+ // CHECK: call void @_ZdlPvm({{.*}}) [[ATTR_BUILTIN_DELETE:#[^ ]*]]
delete p;
// CHECK: call noalias noundef nonnull ptr @_Znam(i64 noundef 12) [[ATTR_BUILTIN_NEW]]
diff --git a/clang/test/CodeGenCXX/ps-dllstorage-vtable-rtti.cpp b/clang/test/CodeGenCXX/ps-dllstorage-vtable-rtti.cpp
new file mode 100644
index 000000000000..377e579058ac
--- /dev/null
+++ b/clang/test/CodeGenCXX/ps-dllstorage-vtable-rtti.cpp
@@ -0,0 +1,114 @@
+/// For a class that has a vtable and typeinfo symbol for RTTI, if a user marks
+/// either:
+///
+/// (a) The entire class as dllexport (dllimport)
+/// (b) Any non-inline method of the class as dllexport (dllimport)
+///
+/// then Clang must export the vtable and typeinfo symbol from the TU where they
+/// are defined (the TU containing the definition of the Itanium C++ ABI "key
+/// function") and must import them in other modules where they are referenced.
+
+// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-unknown-windows-itanium -emit-llvm -o - %s -fhalf-no-semantic-interposition \
+// RUN: | FileCheck %s -check-prefix=WI
+// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-scei-windows-itanium -emit-llvm -o - %s -fhalf-no-semantic-interposition \
+// RUN: | FileCheck %s --check-prefixes=PS
+// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-scei-ps4 -emit-llvm -o - %s -fhalf-no-semantic-interposition \
+// RUN: | FileCheck %s --check-prefixes=PS
+// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-sie-ps5 -emit-llvm -o - %s -fhalf-no-semantic-interposition \
+// RUN: | FileCheck %s --check-prefixes=PS
+
+#include <typeinfo>
+
+/// Case (a) -- Import Aspect
+/// The entire class is imported. The typeinfo symbol must also be imported, but
+/// the vtable will not be referenced, and so does not need to be imported.
+
+// PS-DAG: @_ZTI10FullImport = {{.*}}dllimport
+// WI-DAG: @_ZTI10FullImport = external dllimport constant ptr
+struct __declspec(dllimport) FullImport {
+ virtual void inlineFunc() const {}
+ virtual void key();
+ virtual void func();
+};
+
+/// 'FullImport::key()' is the key function, so the vtable and typeinfo symbol
+/// of 'FullImport' will be defined in the TU that contains the definition of
+/// 'key()' (and they must be exported from there).
+void FullImportTest() { typeid(FullImport).name(); }
+
+/// Case (a) -- Export Aspect
+/// The entire class is exported. The vtable and typeinfo symbols must also be
+/// exported.
+
+// PS-DAG: @_ZTV10FullExport = {{.*}}dllexport
+// WI-DAG: @_ZTV10FullExport = {{.*}}dllexport
+// PS-DAG: @_ZTI10FullExport = {{.*}}dllexport
+// WI-DAG: @_ZTI10FullExport = dso_local dllexport constant {
+struct __declspec(dllexport) FullExport {
+ virtual void inlineFunc() const {}
+ virtual void key();
+ virtual void func();
+};
+
+/// This is the key function of the class 'FullExport', so the vtable and
+/// typeinfo symbols of 'FullExport' will be defined in this TU, and so they
+/// must be exported from this TU.
+void FullExport::key() { typeid(FullExport).name(); }
+
+/// Case (b) -- Import Aspect
+/// The class as a whole is not imported, but a non-inline method of the class
+/// is, so the vtable and typeinfo symbol must be imported.
+
+// PS-DAG: @_ZTV10PartImport = {{.*}}dllimport
+// WI-DAG: @_ZTV10PartImport = external dso_local unnamed_addr constant {
+// PS-DAG: @_ZTI10PartImport = {{.*}}dllimport
+// WI-DAG: @_ZTI10PartImport = external dso_local constant ptr
+struct PartImport {
+ virtual void inlineFunc() const {}
+ virtual void key();
+ __declspec(dllimport) virtual void func();
+};
+
+/// 'PartImport::key()' is the key function, so the vtable and typeinfo symbol
+/// of 'PartImport' will be defined in the TU that contains the definition of
+/// 'key()' (and they must be exported from there). Here, we will reference the
+/// vtable and typeinfo symbol, so we must also import them.
+void PartImportTest() {
+ PartImport f;
+ typeid(PartImport).name();
+}
+
+/// Case (b) -- Export Aspect
+/// The class as a whole is not exported, but a non-inline method of the class
+/// is, so the vtable and typeinfo symbol must be exported.
+
+// PS-DAG: @_ZTV10PartExport = {{.*}}dllexport
+// WI-DAG: @_ZTV10PartExport = dso_local unnamed_addr constant {
+// PS-DAG: @_ZTI10PartExport = {{.*}}dllexport
+// WI-DAG: @_ZTI10PartExport = dso_local constant {
+struct PartExport {
+ virtual void inlineFunc() const {}
+ virtual void key();
+ __declspec(dllexport) virtual void func();
+};
+
+/// This is the key function of the class 'PartExport', so the vtable and
+/// typeinfo symbol of 'PartExport' will be defined in this TU, and so they must
+/// be exported from this TU.
+void PartExport::key() { typeid(PartExport).name(); }
+
+/// Case (b) -- Export Aspect
+/// The class as a whole is not exported, but the constructor of the class
+/// is, so the vtable and typeinfo symbol must be exported.
+
+// PS-DAG: @_ZTV10ConsExport = {{.*}}dllexport
+// WI-DAG: @_ZTV10ConsExport = dso_local unnamed_addr constant {
+// PS-DAG: @_ZTI10ConsExport = {{.*}}dllexport
+// WI-DAG: @_ZTI10ConsExport = dso_local constant {
+struct ConsExport {
+ __declspec(dllexport) ConsExport();
+ virtual void key();
+};
+
+ConsExport::ConsExport() {}
+void ConsExport::key() { typeid(ConsExport).name(); }
diff --git a/clang/test/CodeGenCXX/ps4-dllstorage-vtable-rtti.cpp b/clang/test/CodeGenCXX/ps4-dllstorage-vtable-rtti.cpp
deleted file mode 100644
index 5724e78617df..000000000000
--- a/clang/test/CodeGenCXX/ps4-dllstorage-vtable-rtti.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-// For a class that has a vtable (and hence, also has a typeinfo symbol for
-// RTTI), if a user marks either:
-//
-// (a) the entire class as dllexport (dllimport), or
-// (b) all non-inline virtual methods of the class as dllexport (dllimport)
-//
-// then Clang must export the vtable and typeinfo symbol from the TU where they
-// are defined (the TU containing the definition of the Itanium C++ ABI "key
-// function"), and must import them in other modules where they are referenced.
-//
-// Conversely to point (b), if some (but not all) of the non-inline virtual
-// methods of a class are marked as dllexport (dllimport), then the vtable and
-// typeinfo symbols must not be exported (imported). This will result in a
-// link-time failure when linking the importing module. This link-time failure
-// is the desired behavior, because the Microsoft toolchain also gets a
-// link-time failure in these cases (and since __declspec(dllexport)
-// (__declspec(dllimport)) is a Microsoft extension, our intention is to mimic
-// that Microsoft behavior).
-//
-// Side note: It is within the bodies of constructors (and in some cases,
-// destructors) that the vtable is explicitly referenced. In case (a) above,
-// where the entire class is exported (imported), then all constructors (among
-// other things) are exported (imported). So for that situation, an importing
-// module for a well-formed program will not actually reference the vtable,
-// since constructor calls will all be to functions external to that module
-// (and imported into it, from the exporting module). I.e., all vtable
-// references will be in that module where the constructor and destructor
-// bodies are, therefore, there will not be a need to import the vtable in
-// that case.
-//
-// This test contains 6 test classes:
-// 2 for point (a),
-// 2 for point (b),
-// and 2 negative tests for the converse of point (b).
-//
-// The two tests for each of these points are one for importing, and one for
-// exporting.
-
-// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-unknown-windows-itanium -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s -check-prefix=WI
-// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-scei-windows-itanium -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefixes=PS4,SCEI_WI
-// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-scei-ps4 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefixes=PS4,SCEI_PS4
-// RUN: %clang_cc1 -I%S -fdeclspec -triple x86_64-sie-ps5 -emit-llvm -o - %s -fhalf-no-semantic-interposition | FileCheck %s --check-prefixes=PS4,SCEI_PS4
-
-#include <typeinfo>
-
-// Case (a) -- Import Aspect
-// The entire class is imported. The typeinfo symbol must also be imported,
-// but the vtable will not be referenced, and so does not need to be imported
-// (as described in the "Side note", above).
-//
-// PS4-DAG: @_ZTI10FullImport = {{.*}}dllimport
-// WI-DAG: @_ZTI10FullImport = external dllimport constant ptr
-struct __declspec(dllimport) FullImport
-{
- virtual void getId() {}
- virtual void Bump();
- virtual void Decrement();
-};
-
-// 'FullImport::Bump()' is the key function, so the vtable and typeinfo symbol
-// of 'FullImport' will be defined in the TU that contains the definition of
-// 'Bump()' (and they must be exported from there).
-void FullImportTest()
-{
- typeid(FullImport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// Case (a) -- Export Aspect
-// The entire class is exported. The vtable and typeinfo symbols must also be
-// exported,
-//
-// PS4-DAG: @_ZTV10FullExport ={{.*}}dllexport
-// WI-DAG: @_ZTV10FullExport ={{.*}}dllexport
-// PS4-DAG: @_ZTI10FullExport ={{.*}}dllexport
-// WI-DAG: @_ZTI10FullExport = dso_local dllexport constant {
-struct __declspec(dllexport) FullExport // Easy case: Entire class is exported.
-{
- virtual void getId() {}
- virtual void Bump();
- virtual void Decrement();
-};
-
-// This is the key function of the class 'FullExport', so the vtable and
-// typeinfo symbols of 'FullExport' will be defined in this TU, and so they
-// must be exported from this TU.
-void FullExport::Bump()
-{
- typeid(FullExport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// Case (b) -- Import Aspect
-// The class as a whole is not imported, but all non-inline virtual methods of
-// the class are, so the vtable and typeinfo symbol must be imported.
-//
-// PS4-DAG: @_ZTV9FooImport ={{.*}}dllimport
-// WI-DAG: @_ZTV9FooImport = linkonce_odr dso_local unnamed_addr constant {
-// PS4-DAG: @_ZTI9FooImport ={{.*}}dllimport
-// WI-DAG: @_ZTI9FooImport = linkonce_odr dso_local constant {
-
-
-struct FooImport
-{
- virtual void getId() const {}
- __declspec(dllimport) virtual void Bump();
- __declspec(dllimport) virtual void Decrement();
-};
-
-// 'FooImport::Bump()' is the key function, so the vtable and typeinfo symbol
-// of 'FooImport' will be defined in the TU that contains the definition of
-// 'Bump()' (and they must be exported from there). Here, we will reference
-// the vtable and typeinfo symbol, so we must also import them.
-void importTest()
-{
- typeid(FooImport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// Case (b) -- Export Aspect
-// The class as a whole is not exported, but all non-inline virtual methods of
-// the class are, so the vtable and typeinfo symbol must be exported.
-//
-// PS4-DAG: @_ZTV9FooExport ={{.*}}dllexport
-// WI-DAG: @_ZTV9FooExport = dso_local unnamed_addr constant {
-// PS4-DAG: @_ZTI9FooExport ={{.*}}dllexport
-// WI-DAG: @_ZTI9FooExport = dso_local constant {
-struct FooExport
-{
- virtual void getId() const {}
- __declspec(dllexport) virtual void Bump();
- __declspec(dllexport) virtual void Decrement();
-};
-
-// This is the key function of the class 'FooExport', so the vtable and
-// typeinfo symbol of 'FooExport' will be defined in this TU, and so they must
-// be exported from this TU.
-void FooExport::Bump()
-{
- FooImport f;
- typeid(FooExport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// The tests below verify that the associated vtable and typeinfo symbols are
-// not imported/exported. These are the converse of case (b).
-//
-// Note that ultimately, if the module doing the importing calls a constructor
-// of the class with the vtable, or makes a reference to the typeinfo symbol of
-// the class, then this will result in an unresolved reference (to the vtable
-// or typeinfo symbol) when linking the importing module, and thus a link-time
-// failure.
-//
-// Note that with the Microsoft toolchain there will also be a link-time
-// failure when linking the module doing the importing. With the Microsoft
-// toolchain, it will be an unresolved reference to the method 'Decrement()'
-// of the approriate class, rather than to the vtable or typeinfo symbol of
-// the class, because Microsoft defines the vtable and typeinfo symbol (weakly)
-// everywhere they are used.
-
-// Converse of case (b) -- Import Aspect
-// The class as a whole is not imported, and not all non-inline virtual methods
-// are imported, so the vtable and typeinfo symbol are not to be imported.
-//
-// CHECK-PS4: @_ZTV11FooNoImport = external dso_local unnamed_addr constant {
-// CHECK-WI: @_ZTV11FooNoImport = linkonce_odr dso_local unnamed_addr constant {
-// CHECK-PS4: @_ZTI11FooNoImport = external dso_local constant ptr{{$}}
-// CHECK-WI: @_ZTI11FooNoImport = linkonce_odr dso_local constant {
-struct FooNoImport
-{
- virtual void getId() const {}
- __declspec(dllimport) virtual void Bump();
- virtual void Decrement(); // Not imported.
- int mCounter;
-};
-
-void importNegativeTest()
-{
- FooNoImport f;
- typeid(FooNoImport).name();
-}
-
-///////////////////////////////////////////////////////////////////
-
-// Converse of case (b) -- Export Aspect
-// The class as a whole is not exported, and not all non-inline virtual methods
-// are exported, so the vtable and typeinfo symbol are not to be exported.
-//
-// SCEI_PS4-DAG: @_ZTV11FooNoImport = external unnamed_addr constant {
-// SCEI_WI-DAG: @_ZTV11FooNoExport = dso_local unnamed_addr constant {
-
-// WI-DAG: @_ZTV11FooNoExport = dso_local unnamed_addr constant {
-// SCEI_PS4-DAG: @_ZTI11FooNoExport = constant {
-// SCEI_WI-DAG: @_ZTI11FooNoExport = dso_local constant {
-// WI-DAG: @_ZTI11FooNoExport = dso_local constant {
-struct FooNoExport
-{
- virtual void getId() const {}
- __declspec(dllexport) virtual void Bump();
- virtual void Decrement(); // Not exported.
- int mCounter;
-};
-
-void FooNoExport::Bump()
-{
- typeid(FooNoExport).name();
-}
diff --git a/clang/test/CodeGenCXX/weak-external.cpp b/clang/test/CodeGenCXX/weak-external.cpp
index 5eb262cdbead..e30d4defd455 100644
--- a/clang/test/CodeGenCXX/weak-external.cpp
+++ b/clang/test/CodeGenCXX/weak-external.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple %itanium_abi_triple %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s
// PR4262
// CHECK-NOT: _ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag
diff --git a/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp b/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
index 21c2e45b890f..bfa124bb4dc4 100644
--- a/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
+++ b/clang/test/CodeGenCoroutines/coro-aligned-alloc-2.cpp
@@ -1,9 +1,7 @@
// Tests that the combination of -fcoro-aligned-allocation and -fsized-deallocation works well.
// Test the compiler will choose sized deallocation correctly.
-// This is only enabled with `-fsized-deallocation` which is off by default.
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 \
// RUN: -fcoro-aligned-allocation -emit-llvm %s -o - -disable-llvm-passes \
-// RUN: -fsized-deallocation \
// RUN: | FileCheck %s
#include "Inputs/coroutine.h"
diff --git a/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp b/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
index 8019926b730c..156fa64f454c 100644
--- a/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
+++ b/clang/test/CodeGenCoroutines/coro-aligned-alloc.cpp
@@ -26,8 +26,9 @@ struct task {
// CHECK: %[[aligned_new:.+]] = call{{.*}}@_ZnwmSt11align_val_t({{.*}}%[[coro_size]],{{.*}}%[[coro_align]])
// CHECK: coro.free:
+// CHECK: %[[coro_size_for_free:.+]] = call{{.*}}@llvm.coro.size
// CHECK: %[[coro_align_for_free:.+]] = call{{.*}}@llvm.coro.align
-// CHECK: call void @_ZdlPvSt11align_val_t({{.*}}[[coro_align_for_free]]
+// CHECK: call void @_ZdlPvmSt11align_val_t({{.*}}%[[coro_size_for_free]],{{.*}}%[[coro_align_for_free]])
task f() {
co_return 43;
@@ -58,8 +59,9 @@ void *operator new(std::size_t, std::align_val_t, std::nothrow_t) noexcept;
// CHECK: %[[aligned_new:.+]] = call{{.*}}@_ZnwmSt11align_val_tSt9nothrow_t({{.*}}%[[coro_size]],{{.*}}%[[coro_align]])
// CHECK: coro.free:
+// CHECK: %[[coro_size_for_free:.+]] = call{{.*}}@llvm.coro.size
// CHECK: %[[coro_align_for_free:.+]] = call{{.*}}@llvm.coro.align
-// CHECK: call void @_ZdlPvSt11align_val_t({{.*}}[[coro_align_for_free]]
+// CHECK: call void @_ZdlPvmSt11align_val_t({{.*}}%[[coro_size_for_free]],{{.*}}%[[coro_align_for_free]])
task2 f2() {
co_return 43;
diff --git a/clang/test/CodeGenCoroutines/coro-alloc.cpp b/clang/test/CodeGenCoroutines/coro-alloc.cpp
index d026a0d7df22..7b3be7e0b7f9 100644
--- a/clang/test/CodeGenCoroutines/coro-alloc.cpp
+++ b/clang/test/CodeGenCoroutines/coro-alloc.cpp
@@ -70,7 +70,8 @@ extern "C" void f0(global_new_delete_tag) {
// CHECK: br i1 %[[NeedDealloc]], label %[[FreeBB:.+]], label %[[Afterwards:.+]]
// CHECK: [[FreeBB]]:
- // CHECK: call void @_ZdlPv(ptr noundef %[[MEM]])
+ // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // CHECK: call void @_ZdlPvm(ptr noundef %[[MEM]], i64 noundef %[[SIZE]])
// CHECK: br label %[[Afterwards]]
// CHECK: [[Afterwards]]:
@@ -99,7 +100,8 @@ extern "C" void f1(promise_new_tag ) {
// CHECK: %[[FRAME:.+]] = call ptr @llvm.coro.begin(
// CHECK: %[[MEM:.+]] = call ptr @llvm.coro.free(token %[[ID]], ptr %[[FRAME]])
- // CHECK: call void @_ZdlPv(ptr noundef %[[MEM]])
+ // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // CHECK: call void @_ZdlPvm(ptr noundef %[[MEM]], i64 noundef %[[SIZE]])
co_return;
}
diff --git a/clang/test/CodeGenCoroutines/coro-cleanup.cpp b/clang/test/CodeGenCoroutines/coro-cleanup.cpp
index 98f150758e2d..4e77ac25af1b 100644
--- a/clang/test/CodeGenCoroutines/coro-cleanup.cpp
+++ b/clang/test/CodeGenCoroutines/coro-cleanup.cpp
@@ -84,11 +84,13 @@ void f() {
// CHECK: [[Cleanup]]:
// CHECK: call void @_ZNSt16coroutine_traitsIJvEE12promise_typeD1Ev(
// CHECK: %[[Mem0:.+]] = call ptr @llvm.coro.free(
- // CHECK: call void @_ZdlPv(ptr noundef %[[Mem0]]
+ // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // CHECK: call void @_ZdlPvm(ptr noundef %[[Mem0]], i64 noundef %[[SIZE]])
// CHECK: [[Dealloc]]:
// THROWEND: %[[Mem:.+]] = call ptr @llvm.coro.free(
- // THROWEND: call void @_ZdlPv(ptr noundef %[[Mem]])
+ // THROWEND: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // THROWEND: call void @_ZdlPvm(ptr noundef %[[Mem]], i64 noundef %[[SIZE]])
co_return;
}
diff --git a/clang/test/CodeGenCoroutines/coro-dealloc.cpp b/clang/test/CodeGenCoroutines/coro-dealloc.cpp
index 3cdba6cafdc0..5a699ac9b585 100644
--- a/clang/test/CodeGenCoroutines/coro-dealloc.cpp
+++ b/clang/test/CodeGenCoroutines/coro-dealloc.cpp
@@ -1,6 +1,5 @@
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 \
// RUN: -emit-llvm %s -o - -disable-llvm-passes \
-// RUN: -fsized-deallocation \
// RUN: | FileCheck %s
#include "Inputs/coroutine.h"
@@ -21,7 +20,6 @@ struct task {
};
// Test the compiler will chose sized deallocation correctly.
-// This is only enabled with `-fsized-deallocation` which is off by default.
void operator delete(void *ptr, std::size_t size) noexcept;
// CHECK: define{{.*}}@_Z1fv
diff --git a/clang/test/CodeGenCoroutines/coro-gro.cpp b/clang/test/CodeGenCoroutines/coro-gro.cpp
index d4c3ff589e34..b62134317cef 100644
--- a/clang/test/CodeGenCoroutines/coro-gro.cpp
+++ b/clang/test/CodeGenCoroutines/coro-gro.cpp
@@ -51,7 +51,8 @@ int f() {
// CHECK: call void @_ZNSt16coroutine_traitsIiJEE12promise_typeD1Ev(
// CHECK: %[[Mem:.+]] = call ptr @llvm.coro.free(
- // CHECK: call void @_ZdlPv(ptr noundef %[[Mem]])
+ // CHECK: %[[SIZE:.+]] = call i64 @llvm.coro.size.i64()
+ // CHECK: call void @_ZdlPvm(ptr noundef %[[Mem]], i64 noundef %[[SIZE]])
// Initialize retval from Gro and destroy Gro
// Note this also tests delaying initialization when Gro and function return
diff --git a/clang/test/CodeGenCoroutines/pr56919.cpp b/clang/test/CodeGenCoroutines/pr56919.cpp
index c7de08ef72d7..baa8c27ce664 100644
--- a/clang/test/CodeGenCoroutines/pr56919.cpp
+++ b/clang/test/CodeGenCoroutines/pr56919.cpp
@@ -111,12 +111,15 @@ Task<void> Bar() { co_await Baz(); }
// CHECK: _Z3Quxv.destroy:{{.*}}
// CHECK-NEXT: #
-// CHECK-NEXT: jmp _ZdlPv
+// CHECK-NEXT: movl $40, %esi
+// CHECK-NEXT: jmp _ZdlPvm@PLT
// CHECK: _Z3Bazv.destroy:{{.*}}
// CHECK-NEXT: #
-// CHECK-NEXT: jmp _ZdlPv
+// CHECK-NEXT: movl $80, %esi
+// CHECK-NEXT: jmp _ZdlPvm
// CHECK: _Z3Barv.destroy:{{.*}}
// CHECK-NEXT: #
-// CHECK-NEXT: jmp _ZdlPv
+// CHECK-NEXT: movl $120, %esi
+// CHECK-NEXT: jmp _ZdlPvm
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl
new file mode 100644
index 000000000000..fc5649d8a41f
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl
@@ -0,0 +1,52 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+// CHECK-LABEL: @test_global_load_lds_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 4, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_global_load_lds_u32(global u32* src, local u32 *dst) {
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_global_load_lds_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 2, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_global_load_lds_u16(global u16* src, local u16 *dst) {
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/2, /*offset=*/0, /*aux=*/0);
+}
+
+// CHECK-LABEL: @test_global_load_lds_u8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
+// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 1, i32 0, i32 0)
+// CHECK-NEXT: ret void
+//
+void test_global_load_lds_u8(global u8* src, local u8 *dst) {
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/1, /*offset=*/0, /*aux=*/0);
+}
diff --git a/clang/test/CoverageMapping/builtinmacro.c b/clang/test/CoverageMapping/builtinmacro.c
index abcdc191523a..5d5a176aa7d8 100644
--- a/clang/test/CoverageMapping/builtinmacro.c
+++ b/clang/test/CoverageMapping/builtinmacro.c
@@ -4,7 +4,7 @@
// CHECK: filename
const char *filename (const char *name) { // CHECK-NEXT: File 0, [[@LINE]]:41 -> [[@LINE+3]]:2 = #0
- static const char this_file[] = __FILE__;
+ static const char this_file[] = __FILE__; // CHECK-NEXT: File 0, [[@LINE]]:35 -> [[@LINE]]:35 = #0
return this_file;
}
diff --git a/clang/test/CoverageMapping/macros.c b/clang/test/CoverageMapping/macros.c
index 6bd3be434139..fcf21170ef13 100644
--- a/clang/test/CoverageMapping/macros.c
+++ b/clang/test/CoverageMapping/macros.c
@@ -80,12 +80,14 @@ void func7(void) { // CHECK-NEXT: File 0, [[@LINE]]:18 -> [[@LINE+6]]:2 = #0
int kk,ll; // CHECK-NEXT: File 0, [[@LINE+1]]:7 -> [[@LINE+1]]:8 = #0
if (k) // CHECK-NEXT: Branch,File 0, [[@LINE]]:7 -> [[@LINE]]:8 = #1
m(k); // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:9 -> [[@LINE]]:5 = #1
- else // CHECK-NEXT: Expansion,File 0, [[@LINE-1]]:5 -> [[@LINE-1]]:6 = #0
+ else // CHECK-NEXT: Expansion,File 0, [[@LINE-1]]:5 -> [[@LINE-1]]:6 = #1
l = m(l); // CHECK-NEXT: Gap,File 0, [[@LINE-2]]:7 -> [[@LINE]]:5 = (#0 - #1)
} // CHECK-NEXT: File 0, [[@LINE-1]]:5 -> [[@LINE-1]]:10 = (#0 - #1)
// CHECK-NEXT: Expansion,File 0, [[@LINE-2]]:9 -> [[@LINE-2]]:10 = (#0 - #1)
- // CHECK-NEXT: File 1, [[@LINE-9]]:14 -> [[@LINE-9]]:18 = #0
- // CHECK-NEXT: File 2, [[@LINE-10]]:14 -> [[@LINE-10]]:15 = (#0 - #1)
+ // CHECK-NEXT: File 1, [[@LINE-9]]:14 -> [[@LINE-9]]:17 = #1
+ // CHECK-NEXT: File 1, [[@LINE-10]]:14 -> [[@LINE-10]]:18 = #0
+ // CHECK-NEXT: File 2, [[@LINE-11]]:14 -> [[@LINE-11]]:17 = (#0 - #1)
+ // CHECK-NEXT: File 2, [[@LINE-12]]:14 -> [[@LINE-12]]:15 = (#0 - #1)
int main(int argc, const char *argv[]) {
func();
diff --git a/clang/test/CoverageMapping/mcdc-scratch-space.c b/clang/test/CoverageMapping/mcdc-scratch-space.c
new file mode 100644
index 000000000000..2b5b12d9dcad
--- /dev/null
+++ b/clang/test/CoverageMapping/mcdc-scratch-space.c
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c99 -fcoverage-mcdc -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+
+// CHECK: builtin_macro0:
+int builtin_macro0(int a) {
+ // CHECK: Decision,File 0, [[@LINE+1]]:11 -> [[@LINE+2]]:15 = M:0, C:2
+ return (__LINE__ // CHECK: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:11 = 0, 0 [1,2,0]
+ && a); // CHECK: Branch,File 0, [[@LINE]]:14 -> [[@LINE]]:15 = #2, (#1 - #2) [2,0,0]
+}
+
+// CHECK: builtin_macro1:
+int builtin_macro1(int a) {
+ // CHECK: Decision,File 0, [[@LINE+1]]:11 -> [[@LINE+2]]:22 = M:0, C:2
+ return (a // CHECK: Branch,File 0, [[@LINE]]:11 -> [[@LINE]]:12 = (#0 - #1), #1 [1,0,2]
+ || __LINE__); // CHECK: Branch,File 0, [[@LINE]]:14 -> [[@LINE]]:14 = 0, 0 [2,0,0]
+}
+
+#define PRE(x) pre_##x
+
+// CHECK: pre0:
+int pre0(int pre_a, int b_post) {
+ // CHECK: Decision,File 0, [[@LINE+2]]:11 -> [[@LINE+3]]:20 = M:0, C:2
+ // CHECK: Expansion,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:14 = #0 (Expanded file = 1)
+ return (PRE(a)
+ && b_post);
+ // CHECK: Branch,File 0, [[@LINE-1]]:14 -> [[@LINE-1]]:20 = #2, (#1 - #2) [2,0,0]
+ // CHECK: Branch,File 1, [[@LINE-9]]:16 -> [[@LINE-9]]:22 = #1, (#0 - #1) [1,2,0]
+}
+
+#define pre_foo pre_a
+
+// CHECK: pre1:
+int pre1(int pre_a, int b_post) {
+ // CHECK: Decision,File 0, [[@LINE+3]]:11 -> [[@LINE+4]]:20 = M:0, C:2
+ // CHECK: Expansion,File 0, [[@LINE+2]]:11 -> [[@LINE+2]]:14 = #0 (Expanded file = 1)
+ // CHECK: Branch,File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:20 = #2, (#1 - #2) [2,0,0]
+ return (PRE(foo)
+ && b_post);
+ // CHECK: Expansion,File 1, 17:16 -> 17:20 = #0 (Expanded file = 2)
+ // CHECK: Branch,File 2, 29:17 -> 29:22 = #1, (#0 - #1) [1,2,0]
+}
+
+#define POST(x) x##_post
+
+// CHECK: post0:
+int post0(int pre_a, int b_post) {
+ // CHECK: Decision,File 0, [[@LINE+2]]:11 -> [[@LINE+3]]:18 = M:0, C:2
+ // CHECK: Branch,File 0, [[@LINE+1]]:11 -> [[@LINE+1]]:16 = (#0 - #1), #1 [1,0,2]
+ return (pre_a
+ || POST(b));
+ // CHECK: Expansion,File 0, [[@LINE-1]]:14 -> [[@LINE-1]]:18 = #1 (Expanded file = 1)
+ // CHECK: Branch,File 1, [[@LINE-9]]:17 -> [[@LINE-9]]:20 = (#1 - #2), #2 [2,0,0]
+}
+
+#define bar_post b_post
+
+// CHECK: post1:
+int post1(int pre_a, int b_post) {
+ // CHECK: Decision,File 0, [[@LINE+3]]:11 -> [[@LINE+4]]:18 = M:0, C:2
+ // CHECK: Branch,File 0, [[@LINE+2]]:11 -> [[@LINE+2]]:16 = (#0 - #1), #1 [1,0,2]
+ // CHECK: Expansion,File 0, [[@LINE+2]]:14 -> [[@LINE+2]]:18 = 0 (Expanded file = 1)
+ return (pre_a
+ || POST(bar));
+ // CHECK: Expansion,File 1, 42:17 -> 42:18 = #1 (Expanded file = 2)
+ // CHECK: Branch,File 2, 54:18 -> 54:24 = (#1 - #2), #2 [2,0,0]
+}
diff --git a/clang/test/CoverageMapping/templates.cpp b/clang/test/CoverageMapping/templates.cpp
index 143e566a33cb..7e7f2208f114 100644
--- a/clang/test/CoverageMapping/templates.cpp
+++ b/clang/test/CoverageMapping/templates.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name templates.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++20 -mllvm -emptyline-comment-coverage=false -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name templates.cpp %s | FileCheck %s
template<typename T>
void unused(T x) {
@@ -30,5 +30,6 @@ namespace structural_value_crash {
void test() {
tpl_fn<arr>();
+ tpl_fn<&arr[1]>();
}
}
diff --git a/clang/test/Driver/Ofast.c b/clang/test/Driver/Ofast.c
index 1f9fc78ec1ef..8b7f2217eca2 100644
--- a/clang/test/Driver/Ofast.c
+++ b/clang/test/Driver/Ofast.c
@@ -3,7 +3,9 @@
// RUN: %clang -fno-fast-math -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
// RUN: %clang -fno-strict-aliasing -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
// RUN: %clang -fno-vectorize -Ofast -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST %s
-// RUN: %clang -Ofast -O2 -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 %s
+// RUN: %clang -Ofast -O2 -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-O2 \
+// RUN: %if target={{.*-windows-msvc.*}} %{ --check-prefix=CHECK-OFAST-O2-ALIASING-MSVC %} \
+// RUN: %else %{ --check-prefix=CHECK-OFAST-O2-ALIASING %} %s
// RUN: %clang -Ofast -fno-fast-math -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-FAST-MATH %s
// RUN: %clang -Ofast -fno-strict-aliasing -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-STRICT-ALIASING %s
// RUN: %clang -Ofast -fno-vectorize -### %s 2>&1 | FileCheck -check-prefix=CHECK-OFAST-NO-VECTORIZE %s
@@ -15,7 +17,8 @@
// CHECK-OFAST: -vectorize-loops
// CHECK-OFAST-O2: -cc1
-// CHECK-OFAST-O2-NOT: -relaxed-aliasing
+// CHECK-OFAST-O2-ALIASING-NOT: -relaxed-aliasing
+// CHECK-OFAST-O2-ALIASING-MSVC: -relaxed-aliasing
// CHECK-OFAST-O2-NOT: -ffast-math
// CHECK-OFAST-O2-NOT: -Ofast
// CHECK-OFAST-O2: -vectorize-loops
diff --git a/clang/test/Driver/aarch64-v95a.c b/clang/test/Driver/aarch64-v95a.c
index 1037da65c8cb..62878f212762 100644
--- a/clang/test/Driver/aarch64-v95a.c
+++ b/clang/test/Driver/aarch64-v95a.c
@@ -6,7 +6,7 @@
// RUN: %clang -target aarch64 -mlittle-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s
-// GENERICV95A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"
+// GENERICV95A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+cpa"{{.*}} "-target-feature" "+faminmax"{{.*}} "-target-feature" "+lut"
// RUN: %clang -target aarch64_be -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
// RUN: %clang -target aarch64_be -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
@@ -14,14 +14,10 @@
// RUN: %clang -target aarch64 -mbig-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
// RUN: %clang -target aarch64_be -mbig-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
// RUN: %clang -target aarch64_be -mbig-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s
-// GENERICV95A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"
+// GENERICV95A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+cpa"{{.*}} "-target-feature" "+faminmax"{{.*}} "-target-feature" "+lut"
// ===== Features supported on aarch64 =====
-// RUN: %clang -target aarch64 -march=armv9.5a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s
-// RUN: %clang -target aarch64 -march=armv9.5-a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s
-// V95A-CPA: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+cpa"
-
// RUN: %clang -target aarch64 -march=armv9.5a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s
// RUN: %clang -target aarch64 -march=armv9.5-a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s
// V95A-PAUTHLR: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.5a"{{.*}} "-target-feature" "+pauth-lr"
diff --git a/clang/test/Driver/android-unversioned-fallback-warning.cpp b/clang/test/Driver/android-unversioned-fallback-warning.cpp
index 62a951d14eff..da666cc4d9fa 100644
--- a/clang/test/Driver/android-unversioned-fallback-warning.cpp
+++ b/clang/test/Driver/android-unversioned-fallback-warning.cpp
@@ -14,14 +14,14 @@
// RUN: %clang --target=aarch64-none-linux-android -ccc-install-dir %t/bin \
// RUN: -resource-dir %t/resource -### -c %s 2>&1 | \
// RUN: FileCheck --check-prefix=NO-WARNING %s
-// NO-WARNING-NOT: Using unversioned Android target directory
+// NO-WARNING-NOT: using unversioned Android target directory
// RUN: %clang --target=aarch64-none-linux-android21 -ccc-install-dir %t/bin \
// RUN: -resource-dir %t/resource -### -c %s 2>&1 | \
// RUN: FileCheck --check-prefix=ANDROID21 -DDIR=%t -DSEP=%{fs-sep} %s
-// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]include[[SEP]]aarch64-none-linux-android
-// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]lib[[SEP]]aarch64-none-linux-android
-// ANDROID21-DAG: Using unversioned Android target directory [[DIR]]/resource[[SEP]]lib[[SEP]]aarch64-none-linux-android
+// ANDROID21-DAG: using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]include[[SEP]]aarch64-none-linux-android
+// ANDROID21-DAG: using unversioned Android target directory [[DIR]]/bin[[SEP]]..[[SEP]]lib[[SEP]]aarch64-none-linux-android
+// ANDROID21-DAG: using unversioned Android target directory [[DIR]]/resource[[SEP]]lib[[SEP]]aarch64-none-linux-android
// 23 or newer should use the versioned directory
// RUN: %clang --target=aarch64-none-linux-android23 -ccc-install-dir %t/bin \
diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c
index 75f49deca065..733f243d3c69 100644
--- a/clang/test/Driver/cl-options.c
+++ b/clang/test/Driver/cl-options.c
@@ -740,9 +740,10 @@
// NOCLANG-SAME: "-vectorize-slp"
// NOCLANG-NOT: "--dependent-lib=msvcrt"
-// RUN: %clang_cl -O2 -MD /clang:-fno-slp-vectorize /clang:-MD /clang:-MF /clang:my_dependency_file.dep -### -- %s 2>&1 | FileCheck -check-prefix=CLANG %s
+// RUN: %clang_cl -O2 -MD /clang:-fno-slp-vectorize /clang:-MD /clang:-MF /clang:my_dependency_file.dep /c /Fo%/t/cl-options.obj -### -- %s 2>&1 | FileCheck -DPREFIX=%/t -check-prefix=CLANG %s
// CLANG: "--dependent-lib=msvcrt"
// CLANG-SAME: "-dependency-file" "my_dependency_file.dep"
+// CLANG-SAME: "-MT" "[[PREFIX]]/cl-options.obj"
// CLANG-NOT: "--dependent-lib=libcmt"
// CLANG-NOT: "-vectorize-slp"
diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index 716b02f02a15..51b16f0ce354 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -69,10 +69,7 @@
// RUN: %clang_cl -m32 -arch:avx2 --target=i386-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx2 %s
// avx2: invalid /arch: argument
-// RUN: %clang_cl -m32 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify=KNL1 -DTEST_32_ARCH_AVX512F -- %s
-// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// KNL1-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
+// RUN: %clang_cl -m32 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_32_ARCH_AVX512F -- %s
#if defined(TEST_32_ARCH_AVX512F)
#if _M_IX86_FP != 2 || !__AVX__ || !__AVX2__ || !__AVX512F__ || __AVX512BW__
#error fail
@@ -112,10 +109,7 @@
// RUN: %clang_cl -m64 -arch:avx2 --target=x86_64-pc-windows -### -- 2>&1 %s | FileCheck -check-prefix=avx264 %s
// avx264: invalid /arch: argument
-// RUN: %clang_cl -m64 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify=KNL2 -DTEST_64_ARCH_AVX512F -- %s
-// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// KNL2-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
+// RUN: %clang_cl -m64 -arch:AVX512F --target=i386-pc-windows /c /Fo%t.obj -Xclang -verify -DTEST_64_ARCH_AVX512F -- %s
#if defined(TEST_64_ARCH_AVX512F)
#if _M_IX86_FP || !__AVX__ || !__AVX2__ || !__AVX512F__ || __AVX512BW__
#error fail
diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c
index 472d0725a793..d69cd199ac61 100644
--- a/clang/test/Driver/clang_f_opts.c
+++ b/clang/test/Driver/clang_f_opts.c
@@ -623,3 +623,9 @@
// RUN: %clang -### --target=aarch64-windows-msvc -fno-ms-volatile %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MS-VOLATILE %s
// CHECK-MS-VOLATILE: -fms-volatile
// CHECK-NO-MS-VOLATILE-NOT: -fms-volatile
+
+// RUN: %clang -### --target=x86_64-pc-windows-msvc %s 2>&1 | FileCheck -check-prefix=CHECK-NO-STRICT-ALIASING %s
+// RUN: %clang -### --target=x86_64-pc-windows-msvc -fstrict-aliasing %s 2>&1 | FileCheck -check-prefix=CHECK-STRICT-ALIASING %s
+// RUN: %clang -### --target=x86_64-pc-windows-msvc -fno-strict-aliasing %s 2>&1 | FileCheck -check-prefix=CHECK-NO-STRICT-ALIASING %s
+// CHECK-STRICT-ALIASING-NOT: -relaxed-aliasing
+// CHECK-NO-STRICT-ALIASING: -relaxed-aliasing
diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
index a1719a6fbe04..203bc063a010 100644
--- a/clang/test/Driver/cuda-cross-compiling.c
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -83,8 +83,8 @@
// RUN: not %clang -target nvptx64-nvidia-cuda -march=generic %s -### 2>&1 \
// RUN: | FileCheck -check-prefix=MISSING %s
-// MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'ptxas'
-// MISSING: error: Must pass in an explicit nvptx64 gpu architecture to 'nvlink'
+// MISSING: error: must pass in an explicit nvptx64 gpu architecture to 'ptxas'
+// MISSING: error: must pass in an explicit nvptx64 gpu architecture to 'nvlink'
// RUN: %clang -target nvptx64-nvidia-cuda -flto -c %s -### 2>&1 \
// RUN: | FileCheck -check-prefix=GENERIC %s
diff --git a/clang/test/Driver/dxc_dxv_path.hlsl b/clang/test/Driver/dxc_dxv_path.hlsl
index 4845de11d5b0..db2c87063ac3 100644
--- a/clang/test/Driver/dxc_dxv_path.hlsl
+++ b/clang/test/Driver/dxc_dxv_path.hlsl
@@ -1,7 +1,7 @@
// RUN: %clang_dxc -I test -Tlib_6_3 -### %s 2>&1 | FileCheck %s
// Make sure report warning.
-// CHECK:dxv not found.
+// CHECK:dxv not found
// RUN: echo "dxv" > %T/dxv && chmod 754 %T/dxv && %clang_dxc --dxv-path=%T %s -Tlib_6_3 -### 2>&1 | FileCheck %s --check-prefix=DXV_PATH
// DXV_PATH:dxv{{(.exe)?}}" "-" "-o" "-"
diff --git a/clang/test/Driver/fast-math.c b/clang/test/Driver/fast-math.c
index 274f1f22ea5e..ffd081948914 100644
--- a/clang/test/Driver/fast-math.c
+++ b/clang/test/Driver/fast-math.c
@@ -67,31 +67,31 @@
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
//
// Target defaults for -fmath-errno (reusing the above checks).
-// RUN: %clang -### -target i686-unknown-linux -c %s 2>&1 \
+// RUN: %clang -### --target=i686-unknown-linux -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,ERRNO %s
// RUN: %clang -### -target i686-apple-darwin -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-unknown-freebsd -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-freebsd -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-unknown-netbsd -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-netbsd -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-unknown-openbsd -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-openbsd -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
// RUN: %clang -### --target=x86_64-unknown-haiku -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-unknown-dragonfly -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-dragonfly -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-fuchsia -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-fuchsia -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-linux-android -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-linux-android -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target x86_64-linux-musl -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64-linux-musl -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
// RUN: %clang -### --target=amdgcn-amd-amdhsa -nogpuinc -nogpulib -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target amdgcn-amd-amdpal -c %s 2>&1 \
+// RUN: %clang -### --target=amdgcn-amd-amdpal -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
-// RUN: %clang -### -target amdgcn-mesa-mesa3d -c %s 2>&1 \
+// RUN: %clang -### --target=amdgcn-mesa-mesa3d -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
//
// Check that -ffast-math disables -fmath-errno, and -fno-fast-math merely
@@ -103,9 +103,9 @@
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
// RUN: %clang -### -ffast-math -fmath-errno -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,ERRNO %s
-// RUN: %clang -### -target i686-unknown-linux -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### --target=i686-unknown-linux -fno-fast-math -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,ERRNO %s
-// RUN: %clang -### -target i686-unknown-linux -fno-math-errno -fno-fast-math -c %s 2>&1 \
+// RUN: %clang -### --target=i686-unknown-linux -fno-math-errno -fno-fast-math -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,ERRNO %s
// RUN: %clang -### -target i686-apple-darwin -fno-fast-math -c %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK,NO-ERRNO %s
diff --git a/clang/test/Driver/fat-archive-unbundle-ext.c b/clang/test/Driver/fat-archive-unbundle-ext.c
index e98b872f0c0c..e797acccf02b 100644
--- a/clang/test/Driver/fat-archive-unbundle-ext.c
+++ b/clang/test/Driver/fat-archive-unbundle-ext.c
@@ -2,7 +2,7 @@
// UNSUPPORTED: target={{.*-windows.*}}, target={{.*}}-macosx{{.*}}, target={{.*-darwin.*}}, target={{.*}}-aix{{.*}}
// Generate dummy fat object
-// RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.host.o
+// RUN: %clang -O0 --target=%itanium_abi_triple %s -c -o %t.host.o
// RUN: echo 'Content of device file' > %t.tgt.o
// RUN: clang-offload-bundler -type=o -targets=host-%itanium_abi_triple,openmp-%itanium_abi_triple -input=%t.host.o -input=%t.tgt.o -output=%t.fat.obj
diff --git a/clang/test/Driver/fatal-warnings.c b/clang/test/Driver/fatal-warnings.c
index 6239b25e8917..12c239cf1208 100644
--- a/clang/test/Driver/fatal-warnings.c
+++ b/clang/test/Driver/fatal-warnings.c
@@ -1,5 +1,5 @@
-// RUN: %clang -### %s -c -o tmp.o -target i686-pc-linux-gnu -integrated-as -Wa,--fatal-warnings 2>&1 | FileCheck %s
-// RUN: not %clang %s -c -o %t.o -target i686-pc-linux-gnu -integrated-as -Wa,--fatal-warnings 2>&1 %t.log
+// RUN: %clang -### %s -c -o tmp.o --target=i686-pc-linux-gnu -integrated-as -Wa,--fatal-warnings 2>&1 | FileCheck %s
+// RUN: not %clang %s -c -o %t.o --target=i686-pc-linux-gnu -integrated-as -Wa,--fatal-warnings 2>&1 %t.log
// FileCheck --check-prefix=CHECK-AS %s -input-file %t.log
// CHECK: "-cc1" {{.*}} "-massembler-fatal-warnings"
diff --git a/clang/test/Driver/fbinutils-version.c b/clang/test/Driver/fbinutils-version.c
index 56a49ed2540f..14b44b4d9dd0 100644
--- a/clang/test/Driver/fbinutils-version.c
+++ b/clang/test/Driver/fbinutils-version.c
@@ -1,29 +1,29 @@
-// RUN: %clang -### -c -target x86_64-linux %s -fbinutils-version=none 2>&1 | FileCheck %s --check-prefix=NONE
+// RUN: %clang -### -c --target=x86_64-linux %s -fbinutils-version=none 2>&1 | FileCheck %s --check-prefix=NONE
// NONE: "-fbinutils-version=none"
-// RUN: %clang -### -c -target aarch64-linux %s -fbinutils-version=2 2>&1 | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang -### -c --target=aarch64-linux %s -fbinutils-version=2 2>&1 | FileCheck %s --check-prefix=CHECK2
// CHECK2: "-fbinutils-version=2"
-// RUN: %clang -### -c -target aarch64-linux %s -fbinutils-version=2.35 2>&1 | FileCheck %s --check-prefix=CHECK2_35
+// RUN: %clang -### -c --target=aarch64-linux %s -fbinutils-version=2.35 2>&1 | FileCheck %s --check-prefix=CHECK2_35
// CHECK2_35: "-fbinutils-version=2.35"
/// Disallow -fbinutils-version=0 because we use $major==0 to indicate the MC
/// default in the backend.
-// RUN: not %clang -c -target x86_64-linux %s -fbinutils-version=0 2>&1 | FileCheck %s --check-prefix=ERR0
+// RUN: not %clang -c --target=x86_64-linux %s -fbinutils-version=0 2>&1 | FileCheck %s --check-prefix=ERR0
// ERR0: error: invalid argument '0' to -fbinutils-version=
-// RUN: not %clang -c -target x86_64-linux %s -fbinutils-version=nan 2>&1 | FileCheck %s --check-prefix=ERR1
+// RUN: not %clang -c --target=x86_64-linux %s -fbinutils-version=nan 2>&1 | FileCheck %s --check-prefix=ERR1
// ERR1: error: invalid argument 'nan' to -fbinutils-version=
-// RUN: not %clang -c -target x86_64-linux %s -fbinutils-version=2. 2>&1 | FileCheck %s --check-prefix=ERR2
+// RUN: not %clang -c --target=x86_64-linux %s -fbinutils-version=2. 2>&1 | FileCheck %s --check-prefix=ERR2
// ERR2: error: invalid argument '2.' to -fbinutils-version=
-// RUN: not %clang -c -target x86_64-linux %s -fbinutils-version=3.-14 2>&1 | FileCheck %s --check-prefix=ERR3
+// RUN: not %clang -c --target=x86_64-linux %s -fbinutils-version=3.-14 2>&1 | FileCheck %s --check-prefix=ERR3
// ERR3: error: invalid argument '3.-14' to -fbinutils-version=
diff --git a/clang/test/Driver/fdirect-access-external-data.c b/clang/test/Driver/fdirect-access-external-data.c
index a6da776e6977..4dfb700d6c45 100644
--- a/clang/test/Driver/fdirect-access-external-data.c
+++ b/clang/test/Driver/fdirect-access-external-data.c
@@ -1,13 +1,13 @@
/// -fno-pic code defaults to -fdirect-access-external-data.
-// RUN: %clang -### -c -target x86_64 %s 2>&1 | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang -### -c -target x86_64 %s -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang -### -c -target x86_64 %s -fdirect-access-external-data -fno-direct-access-external-data 2>&1 | FileCheck %s --check-prefix=INDIRECT
+// RUN: %clang -### -c --target=x86_64 %s 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=x86_64 %s -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=x86_64 %s -fdirect-access-external-data -fno-direct-access-external-data 2>&1 | FileCheck %s --check-prefix=INDIRECT
/// -fpie/-fpic code defaults to -fdirect-access-external-data.
-// RUN: %clang -### -c -target x86_64 %s -fpie 2>&1 | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang -### -c -target x86_64 %s -fpie -fno-direct-access-external-data -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
-// RUN: %clang -### -c -target aarch64 %s -fpic 2>&1 | FileCheck %s --check-prefix=DEFAULT
-// RUN: %clang -### -c -target aarch64 %s -fpic -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
+// RUN: %clang -### -c --target=x86_64 %s -fpie 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=x86_64 %s -fpie -fno-direct-access-external-data -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
+// RUN: %clang -### -c --target=aarch64 %s -fpic 2>&1 | FileCheck %s --check-prefix=DEFAULT
+// RUN: %clang -### -c --target=aarch64 %s -fpic -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
/// loongarch* targets default to -fno-direct-access-external-data even for -fno-pic.
// RUN: %clang -### -c --target=loongarch64 -fno-pic %s 2>&1 | FileCheck %s --check-prefix=INDIRECT
diff --git a/clang/test/Driver/fembed-bitcode.c b/clang/test/Driver/fembed-bitcode.c
index 970500525a50..9081314d121c 100644
--- a/clang/test/Driver/fembed-bitcode.c
+++ b/clang/test/Driver/fembed-bitcode.c
@@ -1,5 +1,5 @@
// RUN: %clang -target x86_64-apple-macosx -fembed-bitcode=all -c %s -o /dev/null -### 2>&1 \
-// RUN: | FileCheck -check-prefix CHECK-X64 %s
+// RUN: | FileCheck --check-prefix=CHECK-X64 %s
// CHECK-X64: "-cc1"
@@ -7,7 +7,7 @@
// CHECK-X64-NOT: "-fdebug-compilation-dir
// RUN: %clang -target armv7-apple-ios -fembed-bitcode=all -c %s -o /dev/null -### 2>&1 \
-// RUN: | FileCheck -check-prefix CHECK-ARM %s
+// RUN: | FileCheck --check-prefix=CHECK-ARM %s
// CHECK-ARM: "-cc1"
@@ -17,7 +17,7 @@
// CHECK-ARM-NOT: "-fdebug-compilation-dir
// RUN: %clang -target arm64-apple-ios -fembed-bitcode=all -c %s -o /dev/null -### 2>&1 \
-// RUN: | FileCheck -check-prefix CHECK-AARCH64 %s
+// RUN: | FileCheck --check-prefix=CHECK-AARCH64 %s
// CHECK-AARCH64: "-cc1"
@@ -26,12 +26,12 @@
// CHECK-AARCH64: "darwinpcs"
// CHECK-AARCH64-NOT: "-fdebug-compilation-dir
-// RUN: %clang -target hexagon-unknown-elf -ffixed-r19 -fembed-bitcode=all -c %s -### 2>&1 \
+// RUN: %clang --target=hexagon-unknown-elf -ffixed-r19 -fembed-bitcode=all -c %s -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-HEXAGON %s
// CHECK-HEXAGON: "-target-feature"
// CHECK-HEXAGON: "+reserved-r19"
//
-// RUN: %clang -target wasm32-unknown-unknown -fembed-bitcode=all -pthread -c %s -o /dev/null -### 2>&1 \
+// RUN: %clang --target=wasm32-unknown-unknown -fembed-bitcode=all -pthread -c %s -o /dev/null -### 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-WASM %s
// CHECK-WASM: "-cc1"
diff --git a/clang/test/Driver/fexcess-precision.c b/clang/test/Driver/fexcess-precision.c
index 68579b606c9b..0aa1022f17fd 100644
--- a/clang/test/Driver/fexcess-precision.c
+++ b/clang/test/Driver/fexcess-precision.c
@@ -1,19 +1,19 @@
// Note: %s must be preceded by --, otherwise it may be interpreted as a
// command-line option, e.g. on Mac where %s is commonly under /Users.
-// RUN: %clang -### -target i386 -fexcess-precision=fast -c %s 2>&1 \
+// RUN: %clang -### --target=i386 -fexcess-precision=fast -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang_cl -### -target i386 -fexcess-precision=fast -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=i386 -fexcess-precision=fast -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang -### -target i386 -fexcess-precision=standard -c %s 2>&1 \
+// RUN: %clang -### --target=i386 -fexcess-precision=standard -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-STD %s
-// RUN: %clang_cl -### -target i386 -fexcess-precision=standard -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=i386 -fexcess-precision=standard -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-STD %s
-// RUN: %clang -### -target i386 -fexcess-precision=16 -c %s 2>&1 \
+// RUN: %clang -### --target=i386 -fexcess-precision=16 -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NONE %s
-// RUN: %clang_cl -### -target i386 -fexcess-precision=16 -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=i386 -fexcess-precision=16 -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NONE %s
// RUN: not %clang -### --target=i386 -fexcess-precision=none -c %s 2>&1 \
@@ -21,19 +21,19 @@
// RUN: not %clang_cl -### --target=i386 -fexcess-precision=none -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-ERR-NONE %s
-// RUN: %clang -### -target x86_64 -fexcess-precision=fast -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64 -fexcess-precision=fast -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang_cl -### -target x86_64 -fexcess-precision=fast -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=x86_64 -fexcess-precision=fast -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-FAST %s
-// RUN: %clang -### -target x86_64 -fexcess-precision=standard -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64 -fexcess-precision=standard -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-STD %s
-// RUN: %clang_cl -### -target x86_64 -fexcess-precision=standard -c \
+// RUN: %clang_cl -### --target=x86_64 -fexcess-precision=standard -c \
// RUN: -- %s 2>&1 | FileCheck --check-prefix=CHECK-STD %s
-// RUN: %clang -### -target x86_64 -fexcess-precision=16 -c %s 2>&1 \
+// RUN: %clang -### --target=x86_64 -fexcess-precision=16 -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NONE %s
-// RUN: %clang_cl -### -target x86_64 -fexcess-precision=16 -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=x86_64 -fexcess-precision=16 -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-NONE %s
// RUN: not %clang -### --target=x86_64 -fexcess-precision=none -c %s 2>&1 \
@@ -41,14 +41,14 @@
// RUN: not %clang_cl -### --target=x86_64 -fexcess-precision=none -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefixes=CHECK-ERR-NONE %s
-// RUN: %clang -### -target aarch64 -fexcess-precision=fast -c %s 2>&1 \
+// RUN: %clang -### --target=aarch64 -fexcess-precision=fast -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK %s
-// RUN: %clang_cl -### -target aarch64 -fexcess-precision=fast -c -- %s 2>&1 \
+// RUN: %clang_cl -### --target=aarch64 -fexcess-precision=fast -c -- %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK %s
-// RUN: %clang -### -target aarch64 -fexcess-precision=standard -c %s 2>&1 \
+// RUN: %clang -### --target=aarch64 -fexcess-precision=standard -c %s 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK %s
-// RUN: %clang_cl -### -target aarch64 -fexcess-precision=standard -c \
+// RUN: %clang_cl -### --target=aarch64 -fexcess-precision=standard -c \
// RUN: -- %s 2>&1 | FileCheck --check-prefix=CHECK %s
// RUN: not %clang -### --target=aarch64 -fexcess-precision=16 -c %s 2>&1 \
diff --git a/clang/test/Driver/fextend-args.c b/clang/test/Driver/fextend-args.c
index 7f19f8c5ec48..0b721202a000 100644
--- a/clang/test/Driver/fextend-args.c
+++ b/clang/test/Driver/fextend-args.c
@@ -5,7 +5,7 @@
// RUN: | FileCheck -check-prefix=CHECK-64 %s
// Unsupported target
-// RUN: not %clang -target aarch64-unknown-windows-msvc -fextend-arguments=32 %s 2>&1 \
+// RUN: not %clang --target=aarch64-unknown-windows-msvc -fextend-arguments=32 %s 2>&1 \
// RUN: | FileCheck -check-prefix=UNSUPPORTED-TARGET %s
// Invalid option value
diff --git a/clang/test/Driver/fforce-dwarf-frame.c b/clang/test/Driver/fforce-dwarf-frame.c
index fb5442c56a40..c4bc2619e0ef 100644
--- a/clang/test/Driver/fforce-dwarf-frame.c
+++ b/clang/test/Driver/fforce-dwarf-frame.c
@@ -1,6 +1,6 @@
-// RUN: %clang -target arm -c -### %s -fforce-dwarf-frame 2>&1 | FileCheck --check-prefix=CHECK-ALWAYS %s
-// RUN: %clang -target arm -c -### %s -fno-force-dwarf-frame 2>&1 | FileCheck --check-prefix=CHECK-NO-ALWAYS %s
-// RUN: %clang -target arm -c -### %s 2>&1 | FileCheck --check-prefix=CHECK-NO-ALWAYS %s
+// RUN: %clang --target=arm -c -### %s -fforce-dwarf-frame 2>&1 | FileCheck --check-prefix=CHECK-ALWAYS %s
+// RUN: %clang --target=arm -c -### %s -fno-force-dwarf-frame 2>&1 | FileCheck --check-prefix=CHECK-NO-ALWAYS %s
+// RUN: %clang --target=arm -c -### %s 2>&1 | FileCheck --check-prefix=CHECK-NO-ALWAYS %s
// CHECK-ALWAYS: -fforce-dwarf-frame
// CHECK-NO-ALWAYS-NOT: -fforce-dwarf-frame
diff --git a/clang/test/Driver/fgnuc-version.c b/clang/test/Driver/fgnuc-version.c
index dea82bbaae0a..c5c8ca1c159a 100644
--- a/clang/test/Driver/fgnuc-version.c
+++ b/clang/test/Driver/fgnuc-version.c
@@ -2,25 +2,25 @@
// Verify -fgnuc-version parsing
//
-// RUN: %clang -c %s -target i686-linux -### 2>&1 | FileCheck %s -check-prefix GNUC-DEFAULT
+// RUN: %clang -c %s --target=i686-linux -### 2>&1 | FileCheck %s --check-prefix=GNUC-DEFAULT
// GNUC-DEFAULT: "-fgnuc-version=4.2.1"
-// RUN: %clang -c %s -target i686-linux -fgnuc-version=100.99.99 -### 2>&1 | FileCheck %s -check-prefix GNUC-OVERRIDE
+// RUN: %clang -c %s --target=i686-linux -fgnuc-version=100.99.99 -### 2>&1 | FileCheck %s --check-prefix=GNUC-OVERRIDE
// GNUC-OVERRIDE: "-fgnuc-version=100.99.99"
-// RUN: %clang -c %s -target i686-linux -fgnuc-version=0 -### 2>&1 | FileCheck %s -check-prefix GNUC-DISABLE
-// RUN: %clang -c %s -target i686-linux -fgnuc-version= -### 2>&1 | FileCheck %s -check-prefix GNUC-DISABLE
+// RUN: %clang -c %s --target=i686-linux -fgnuc-version=0 -### 2>&1 | FileCheck %s --check-prefix=GNUC-DISABLE
+// RUN: %clang -c %s --target=i686-linux -fgnuc-version= -### 2>&1 | FileCheck %s --check-prefix=GNUC-DISABLE
// GNUC-DISABLE-NOT: "-fgnuc-version=
-// RUN: not %clang -c %s -target i686-linux -fgnuc-version=100.100.10 2>&1 | FileCheck %s -check-prefix GNUC-INVALID
-// RUN: not %clang -c %s -target i686-linux -fgnuc-version=100.10.100 2>&1 | FileCheck %s -check-prefix GNUC-INVALID
-// RUN: not %clang -c %s -target i686-linux -fgnuc-version=-1.0.0 2>&1 | FileCheck %s -check-prefix GNUC-INVALID
+// RUN: not %clang -c %s --target=i686-linux -fgnuc-version=100.100.10 2>&1 | FileCheck %s --check-prefix=GNUC-INVALID
+// RUN: not %clang -c %s --target=i686-linux -fgnuc-version=100.10.100 2>&1 | FileCheck %s --check-prefix=GNUC-INVALID
+// RUN: not %clang -c %s --target=i686-linux -fgnuc-version=-1.0.0 2>&1 | FileCheck %s --check-prefix=GNUC-INVALID
// GNUC-INVALID: error: invalid value {{.*}} in '-fgnuc-version={{.*}}'
-// RUN: %clang -fgnuc-version=100.99.99 %s -dM -E -o - | FileCheck %s -check-prefix GNUC-LARGE
+// RUN: %clang -fgnuc-version=100.99.99 %s -dM -E -o - | FileCheck %s --check-prefix=GNUC-LARGE
// GNUC-LARGE: #define __GNUC_MINOR__ 99
// GNUC-LARGE: #define __GNUC_PATCHLEVEL__ 99
// GNUC-LARGE: #define __GNUC__ 100
-// RUN: %clang -fgnuc-version=100.99.99 -x c++ %s -dM -E -o - | FileCheck %s -check-prefix GXX-LARGE
+// RUN: %clang -fgnuc-version=100.99.99 -x c++ %s -dM -E -o - | FileCheck %s --check-prefix=GXX-LARGE
// GXX-LARGE: #define __GNUG__ 100
diff --git a/clang/test/Driver/flags.c b/clang/test/Driver/flags.c
index da25a5cd3335..16b760609c36 100644
--- a/clang/test/Driver/flags.c
+++ b/clang/test/Driver/flags.c
@@ -25,11 +25,11 @@
// RUN: %clang -target armv7-apple-darwin10 -### -S -mno-implicit-float -mimplicit-float %s 2>&1 | FileCheck -check-prefix=TEST8 %s
// TEST8-NOT: "-no-implicit-float"
-// RUN: %clang -target x86_64-linux-gnu -### -c -fclang-abi-compat=3.2 %s 2>&1 | FileCheck -check-prefix=TEST9 %s
+// RUN: %clang --target=x86_64-linux-gnu -### -c -fclang-abi-compat=3.2 %s 2>&1 | FileCheck -check-prefix=TEST9 %s
// TEST9: "-fclang-abi-compat=3.2"
//
-// RUN: %clang -target riscv32 -### -S -mno-implicit-float %s 2>&1 | FileCheck -check-prefix=TEST10 %s
+// RUN: %clang --target=riscv32 -### -S -mno-implicit-float %s 2>&1 | FileCheck -check-prefix=TEST10 %s
// TEST10: "-no-implicit-float"
//
-// RUN: %clang -target riscv64 -### -S -mno-implicit-float %s 2>&1 | FileCheck -check-prefix=TEST11 %s
+// RUN: %clang --target=riscv64 -### -S -mno-implicit-float %s 2>&1 | FileCheck -check-prefix=TEST11 %s
// TEST11: "-no-implicit-float"
diff --git a/clang/test/Driver/flang/msvc-link.f90 b/clang/test/Driver/flang/msvc-link.f90
index 536da2599431..463749510eb5 100644
--- a/clang/test/Driver/flang/msvc-link.f90
+++ b/clang/test/Driver/flang/msvc-link.f90
@@ -1,4 +1,4 @@
-! RUN: %clang --driver-mode=flang -target x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s
+! RUN: %clang --driver-mode=flang --target=x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s
!
! Test that user provided paths come before the Flang runtimes
! CHECK: "-libpath:test"
diff --git a/clang/test/Driver/fmemprof.cpp b/clang/test/Driver/fmemprof.cpp
index b00d9f2c81e2..5165c4452fd5 100644
--- a/clang/test/Driver/fmemprof.cpp
+++ b/clang/test/Driver/fmemprof.cpp
@@ -1,7 +1,7 @@
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile %s -### 2>&1 | FileCheck %s
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile=foo %s -### 2>&1 | FileCheck %s --check-prefix=DIR
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile=foo -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile %s -### 2>&1 | FileCheck %s
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile=foo %s -### 2>&1 | FileCheck %s --check-prefix=DIR
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile=foo -fno-memory-profile %s -### 2>&1 | FileCheck %s --check-prefix=OFF
// CHECK: "-cc1" {{.*}} "-fmemory-profile"
// CHECK: ld{{.*}}libclang_rt.memprof{{.*}}libclang_rt.memprof_cxx
// DIR: "-cc1" {{.*}} "-fmemory-profile=foo"
@@ -9,7 +9,7 @@
// OFF-NOT: "-fmemory-profile"
// OFF-NOT: libclang_rt.memprof
-// RUN: %clangxx -target x86_64-linux-gnu -fmemory-profile-use=foo %s -### 2>&1 | FileCheck %s --check-prefix=USE
+// RUN: %clangxx --target=x86_64-linux-gnu -fmemory-profile-use=foo %s -### 2>&1 | FileCheck %s --check-prefix=USE
// USE: "-cc1" {{.*}} "-fmemory-profile-use=foo"
// RUN: not %clangxx --target=x86_64-linux-gnu -fmemory-profile -fmemory-profile-use=foo %s -### 2>&1 | FileCheck %s --check-prefix=CONFLICTWITHMEMPROFINSTR
diff --git a/clang/test/Driver/fopenmp.c b/clang/test/Driver/fopenmp.c
index 291946923b3e..7d343eeee0f3 100644
--- a/clang/test/Driver/fopenmp.c
+++ b/clang/test/Driver/fopenmp.c
@@ -1,27 +1,27 @@
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
// RUN: %clang -target x86_64-apple-darwin -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
// RUN: %clang -target x86_64-apple-darwin -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
// RUN: %clang -target x86_64-apple-darwin -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libgomp -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libiomp5 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
// RUN: %clang_cl --target=x86_64-windows-msvc /clang:-fopenmp=libomp /openmp -### -- %s 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
// RUN: %clang_cl --target=x86_64-windows-msvc /clang:-fopenmp=libgomp /openmp -### -- %s 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-NO-OPENMP
// RUN: %clang_cl --target=x86_64-windows-msvc /clang:-fopenmp=libiomp5 /openmp -### -- %s 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMP
@@ -36,99 +36,99 @@
// CHECK-CC1-NO-OPENMP: "-cc1"
// CHECK-CC1-NO-OPENMP-NOT: "-fopenmp"
//
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-RT
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-RT
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-RT
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-RT
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-linux-gnu -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-linux-gnu -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-linux-gnu -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-linux-gnu -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-linux-gnu -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-linux-gnu -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-darwin -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-darwin -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-darwin -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-darwin -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-darwin -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-darwin -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -nostdlib -target x86_64-darwin -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-darwin -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-darwin -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-darwin -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-darwin -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-darwin -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-freebsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-freebsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-freebsd -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-freebsd -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target x86_64-freebsd -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-freebsd -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-freebsd -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-freebsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-freebsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-freebsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-freebsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-freebsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-freebsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-netbsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-netbsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-netbsd -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-netbsd -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target x86_64-netbsd -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-netbsd -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-netbsd -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-netbsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-netbsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-netbsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-netbsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-netbsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-netbsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-openbsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-openbsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-openbsd -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-openbsd -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target x86_64-openbsd -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-openbsd -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-openbsd -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-openbsd -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-openbsd -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-openbsd -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-openbsd -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-openbsd -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-openbsd -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target x86_64-dragonfly -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=x86_64-dragonfly -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target x86_64-dragonfly -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-dragonfly -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-dragonfly -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=x86_64-dragonfly -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-dragonfly -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-dragonfly -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5
//
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libgomp -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libiomp5 -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp=libiomp5 -static -static-openmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-OMP
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libgomp -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-GOMP --check-prefix=CHECK-LD-STATIC-GOMP-NO-RT
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libiomp5 -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp=libiomp5 -static -static-openmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC
//
-// RUN: %clang -nostdlib -target i386-pc-solaris2.11 -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target i386-pc-solaris2.11 -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target i386-pc-solaris2.11 -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
+// RUN: %clang -nostdlib --target=i386-pc-solaris2.11 -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=i386-pc-solaris2.11 -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=i386-pc-solaris2.11 -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5
//
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
-// RUN: %clang -target x86_64-windows-gnu -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5MD
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-OMP
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-GOMP --check-prefix=CHECK-LD-GOMP-NO-RT
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-IOMP5MD
//
-// RUN: %clang -nostdlib -target x86_64-windows-gnu -fopenmp=libomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
-// RUN: %clang -nostdlib -target x86_64-windows-gnu -fopenmp=libgomp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
-// RUN: %clang -nostdlib -target x86_64-windows-gnu -fopenmp=libiomp5 %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5MD
+// RUN: %clang -nostdlib --target=x86_64-windows-gnu -fopenmp=libomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP
+// RUN: %clang -nostdlib --target=x86_64-windows-gnu -fopenmp=libgomp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-GOMP
+// RUN: %clang -nostdlib --target=x86_64-windows-gnu -fopenmp=libiomp5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IOMP5MD
//
// CHECK-LD-OMP: "{{.*}}ld{{(.exe)?}}"
// CHECK-LD-OMP: "-lomp"
@@ -172,7 +172,7 @@
// CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC: "-{{B?}}static" {{.*}} "-liomp5"
// CHECK-LD-STATIC-IOMP5-NO-BDYNAMIC-NOT: "-Bdynamic"
//
-// RUN: %clang -target x86_64-linux-gnu -fopenmp=libomp -fopenmp-enable-irbuilder -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMPIRBUILDER
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp=libomp -fopenmp-enable-irbuilder -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CC1-OPENMPIRBUILDER
//
// CHECK-CC1-OPENMPIRBUILDER: "-cc1"
// CHECK-CC1-OPENMPIRBUILDER-SAME: "-fopenmp"
@@ -184,14 +184,14 @@
// test the CC1 invocation. Instead, just ensure we do eventually link *some*
// OpenMP runtime.
//
-// RUN: %clang -target x86_64-linux-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-darwin -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-freebsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-netbsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-openbsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-dragonfly -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target i386-pc-solaris2.11 -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
-// RUN: %clang -target x86_64-windows-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANYMD
+// RUN: %clang --target=x86_64-linux-gnu -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-darwin -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-freebsd -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-netbsd -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-openbsd -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-dragonfly -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=i386-pc-solaris2.11 -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+// RUN: %clang --target=x86_64-windows-gnu -fopenmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANYMD
//
// CHECK-LD-ANY: "{{.*}}ld{{(.exe)?}}"
// CHECK-LD-ANY: "-l{{(omp|gomp|iomp5)}}"
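As a rough illustration of what these RUN lines exercise (the file name example.c and the grep filter are placeholders, not part of the test suite), the planned link job can be inspected directly from the driver:

$ clang --target=x86_64-linux-gnu -fopenmp example.c -### 2>&1 | grep -Eo '"-l(omp|gomp|iomp5)"'

With whichever OpenMP runtime the build defaults to, this should print one of "-lomp", "-lgomp" or "-liomp5", which is exactly the pattern the CHECK-LD-ANY prefix matches.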
diff --git a/clang/test/Driver/fortran.f95 b/clang/test/Driver/fortran.f95
index db3ff2da17e8..275b1886b2fd 100644
--- a/clang/test/Driver/fortran.f95
+++ b/clang/test/Driver/fortran.f95
@@ -1,21 +1,21 @@
! Check that the clang driver can invoke gcc to compile Fortran when in
! --driver-mode=clang. This is legacy behaviour - see also --driver-mode=flang.
-! RUN: %clang -target x86_64-unknown-linux-gnu -integrated-as -c %s -### 2>&1 \
+! RUN: %clang --target=x86_64-unknown-linux-gnu -integrated-as -c %s -### 2>&1 \
! RUN: | FileCheck --check-prefix=CHECK-OBJECT %s
! CHECK-OBJECT: gcc
! CHECK-OBJECT: "-c"
! CHECK-OBJECT: "-x" "f95"
! CHECK-OBJECT-NOT: "-cc1as"
-! RUN: %clang -target x86_64-unknown-linux-gnu -integrated-as -S %s -### 2>&1 \
+! RUN: %clang --target=x86_64-unknown-linux-gnu -integrated-as -S %s -### 2>&1 \
! RUN: | FileCheck --check-prefix=CHECK-ASM %s
! CHECK-ASM: gcc
! CHECK-ASM: "-S"
! CHECK-ASM: "-x" "f95"
! CHECK-ASM-NOT: "-cc1"
-! RUN: %clang -Wall -target x86_64-unknown-linux-gnu -integrated-as %s -o %t -### 2>&1 | FileCheck --check-prefix=CHECK-WARN %s
+! RUN: %clang -Wall --target=x86_64-unknown-linux-gnu -integrated-as %s -### 2>&1 | FileCheck --check-prefix=CHECK-WARN %s
! CHECK-WARN: gcc
! CHECK-WARN-NOT: "-Wall"
! CHECK-WARN: ld
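For context (the invocation and sample file name below are illustrative), the legacy behaviour checked above can be reproduced by handing the clang driver a Fortran input:

$ clang --target=x86_64-unknown-linux-gnu -c example.f95 -###

The printed jobs are expected to show gcc being invoked with "-x" "f95" rather than a clang -cc1/-cc1as invocation, matching the CHECK-OBJECT lines.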
diff --git a/clang/test/Driver/fpatchable-function-entry.c b/clang/test/Driver/fpatchable-function-entry.c
index 4d0d609584c8..ab04fd39ffa1 100644
--- a/clang/test/Driver/fpatchable-function-entry.c
+++ b/clang/test/Driver/fpatchable-function-entry.c
@@ -1,23 +1,23 @@
-// RUN: %clang -target i386 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target x86_64 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target loongarch32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target loongarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target riscv32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target riscv64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=i386 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=x86_64 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64 %s -fpatchable-function-entry=1 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=loongarch32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=loongarch64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv32 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv64 %s -fpatchable-function-entry=1,0 -c -### 2>&1 | FileCheck %s
// CHECK: "-fpatchable-function-entry=1"
-// RUN: %clang -target aarch64 -fsyntax-only %s -fpatchable-function-entry=1,1 -c -### 2>&1 | FileCheck --check-prefix=11 %s
+// RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=1,1 -c -### 2>&1 | FileCheck --check-prefix=11 %s
// 11: "-fpatchable-function-entry=1" "-fpatchable-function-entry-offset=1"
-// RUN: %clang -target aarch64 -fsyntax-only %s -fpatchable-function-entry=2,1 -c -### 2>&1 | FileCheck --check-prefix=21 %s
+// RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=2,1 -c -### 2>&1 | FileCheck --check-prefix=21 %s
// 21: "-fpatchable-function-entry=2" "-fpatchable-function-entry-offset=1"
-// RUN: not %clang -target ppc64 -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=TARGET %s
+// RUN: not %clang --target=ppc64 -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=TARGET %s
// TARGET: error: unsupported option '-fpatchable-function-entry=1' for target 'ppc64'
-// RUN: not %clang -target x86_64 -fsyntax-only %s -fpatchable-function-entry=1,0, 2>&1 | FileCheck --check-prefix=EXCESS %s
+// RUN: not %clang --target=x86_64 -fsyntax-only %s -fpatchable-function-entry=1,0, 2>&1 | FileCheck --check-prefix=EXCESS %s
// EXCESS: error: invalid argument '1,0,' to -fpatchable-function-entry=
-// RUN: not %clang -target aarch64-linux -fsyntax-only %s -fxray-instrument -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=XRAY %s
+// RUN: not %clang --target=aarch64-linux -fsyntax-only %s -fxray-instrument -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=XRAY %s
// XRAY: error: invalid argument '-fxray-instrument' not allowed with '-fpatchable-function-entry='
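A quick way to observe the flag translation these tests pin down (example.c is a placeholder) is to dump the cc1 invocation; the N,M form splits into two cc1 options:

$ clang --target=aarch64 -fpatchable-function-entry=2,1 -c example.c -### 2>&1 | grep fpatchable

The output should contain both "-fpatchable-function-entry=2" and "-fpatchable-function-entry-offset=1", as the 21 prefix checks.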
diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c
index e1b0a468ab82..cdedcc7ae4c8 100644
--- a/clang/test/Driver/frame-pointer-elim.c
+++ b/clang/test/Driver/frame-pointer-elim.c
@@ -6,39 +6,39 @@
// KEEP-NONE: "-mframe-pointer=none"
// On Linux x86, omit frame pointer when optimization is enabled.
-// RUN: %clang -### -target i386-linux -S -fomit-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -fomit-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target i386-linux -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// -fno-omit-frame-pointer or -pg disables frame pointer omission.
-// RUN: %clang -### -target i386-linux -S %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target i386-linux -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target i386-linux -S -O1 -pg %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -pg %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
// -momit-leaf-frame-pointer omits leaf frame pointer.
// -fno-omit-frame-pointer loses out to -momit-leaf-frame-pointer.
-// RUN: %clang -### -target i386 -S -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386 -S -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target i386-linux -S -O1 -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// -fno-omit-frame-pointer -momit-leaf-frame-pointer can be overwritten by
// -fomit-frame-pointer later on the command line without warning.
-// RUN: %clang -### -target i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer -fomit-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer -fomit-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S -O1 -fno-omit-frame-pointer -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
// Explicit or default -fomit-frame-pointer wins over -mno-omit-leaf-frame-pointer.
-// RUN: %clang -### -target i386 -S %s -fomit-frame-pointer -mno-omit-leaf-frame-pointer 2>&1 | \
+// RUN: %clang -### --target=i386 -S %s -fomit-frame-pointer -mno-omit-leaf-frame-pointer 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target i386-linux -S %s -O1 -mno-omit-leaf-frame-pointer 2>&1 | \
+// RUN: %clang -### --target=i386-linux -S %s -O1 -mno-omit-leaf-frame-pointer 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// -pg -fomit-frame-pointer => error.
@@ -48,10 +48,10 @@
// CHECK-MIX-NO-OMIT-FP-PG-NOT: '-fomit-frame-pointer' not allowed with '-pg'
// NetBSD follows the same rules as Linux.
-// RUN: %clang -### -target x86_64-unknown-netbsd -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-unknown-netbsd -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target x86_64-unknown-netbsd -S %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-unknown-netbsd -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
// Darwin disables omitting the leaf frame pointer even under optimization
@@ -62,10 +62,10 @@
// RUN: %clang -### -target i386-apple-darwin -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target i386-darwin -S -fomit-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-darwin -S -fomit-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target i386-darwin -S -momit-leaf-frame-pointer %s 2>&1 | \
+// RUN: %clang -### --target=i386-darwin -S -momit-leaf-frame-pointer %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
// RUN: %clang -### -target armv7s-apple-ios -fomit-frame-pointer %s 2>&1 | \
@@ -85,19 +85,19 @@
// On AArch64, PS4, PS5, and VE, default to omitting the frame pointer on leaf
// functions
-// RUN: %clang -### -target aarch64 -S %s 2>&1 | \
+// RUN: %clang -### --target=aarch64 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target x86_64-scei-ps4 -S %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-scei-ps4 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target x86_64-scei-ps4 -S -O2 %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-scei-ps4 -S -O2 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target x86_64-sie-ps5 -S %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-sie-ps5 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target x86_64-sie-ps5 -S -O2 %s 2>&1 | \
+// RUN: %clang -### --target=x86_64-sie-ps5 -S -O2 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
// RUN: %clang -### -target aarch64-apple-darwin -arch arm64_32 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target ve-unknown-linux-gnu -S %s 2>&1 | \
+// RUN: %clang -### --target=ve-unknown-linux-gnu -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
// RUN: %clang -### --target=aarch64-linux-android -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
@@ -106,57 +106,57 @@
// RUN: %clang -### --target=aarch64-linux-android -S -Os %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
-// RUN: %clang -### -target powerpc64 -S %s 2>&1 | \
+// RUN: %clang -### --target=powerpc64 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target powerpc64 -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=powerpc64 -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// SPARC targets omit the frame pointer when optimizations are enabled.
-// RUN: %clang -### -target sparc -S %s 2>&1 | \
+// RUN: %clang -### --target=sparc -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target sparc -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=sparc -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target sparcel -S %s 2>&1 | \
+// RUN: %clang -### --target=sparcel -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target sparcel -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=sparcel -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target sparc64 -S %s 2>&1 | \
+// RUN: %clang -### --target=sparc64 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target sparc64 -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=sparc64 -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// M68k targets omit the frame pointer when optimizations are enabled.
-// RUN: %clang -### -target m68k -S %s 2>&1 | \
+// RUN: %clang -### --target=m68k -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target m68k -S -O1 %s 2>&1 | \
+// RUN: %clang -### --target=m68k -S -O1 %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// For AArch32 (A32, T32) Linux targets, default to omitting the frame pointer when
// optimizations are enabled.
-// RUN: %clang -### -target arm-linux-gnueabihf- -marm -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -marm -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -mthumb -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -mthumb -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -marm -mbig-endian -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -marm -mbig-endian -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -mthumb -mbig-endian -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -mthumb -mbig-endian -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -marm -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -marm -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -mthumb -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -mthumb -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -marm -mbig-endian -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -marm -mbig-endian -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
-// RUN: %clang -### -target arm-linux-gnueabihf- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=arm-linux-gnueabihf- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NONE %s
// For Android, always keep the frame pointer.
-// RUN: %clang -### -target armv7a-linux-androideabi- -marm -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=armv7a-linux-androideabi- -marm -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target armv7a-linux-androideabi- -mthumb -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=armv7a-linux-androideabi- -mthumb -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target armv7a-linux-androideabi- -marm -mbig-endian -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=armv7a-linux-androideabi- -marm -mbig-endian -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
-// RUN: %clang -### -target armv7a-linux-androideabi- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
+// RUN: %clang -### --target=armv7a-linux-androideabi- -mthumb -mbig-endian -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-ALL %s
// RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \
// RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s
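Any one of these combinations can be spot-checked by hand (example.c is a placeholder; the non-leaf spelling is inferred from the KEEP-NONE mapping shown at the top of the test) by pulling the relevant cc1 option out of the -### output:

$ clang -### --target=aarch64 -S example.c 2>&1 | grep -o '"-mframe-pointer=[a-z-]*"'

For a target such as aarch64 that keeps only non-leaf frame pointers by default, this is expected to print "-mframe-pointer=non-leaf"; adding -fomit-frame-pointer, or -O1 on the Linux x86 targets above, should switch it to "-mframe-pointer=none".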
diff --git a/clang/test/Driver/freebsd-mips-as.c b/clang/test/Driver/freebsd-mips-as.c
index a053c2180e52..428644ab78a9 100644
--- a/clang/test/Driver/freebsd-mips-as.c
+++ b/clang/test/Driver/freebsd-mips-as.c
@@ -1,91 +1,91 @@
// Check passing options to the assembler for MIPS targets.
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-AS %s
// MIPS32-EB-AS: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-AS-NOT: "-KPIC"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -fPIC -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-PIC %s
// MIPS32-EB-PIC: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-PIC: "-KPIC"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -fpic -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-PIC-SMALL %s
// MIPS32-EB-PIC-SMALL: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-PIC-SMALL: "-KPIC"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -fPIE -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-PIE %s
// MIPS32-EB-PIE: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-PIE: "-KPIC"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -fpie -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-PIE-SMALL %s
// MIPS32-EB-PIE-SMALL: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB"
// MIPS32-EB-PIE-SMALL: "-KPIC"
//
-// RUN: %clang -target mipsel-unknown-freebsd -### \
+// RUN: %clang --target=mipsel-unknown-freebsd -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-DEF-EL-AS %s
// MIPS32-DEF-EL-AS: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EL"
//
-// RUN: %clang -target mips64-unknown-freebsd -### \
+// RUN: %clang --target=mips64-unknown-freebsd -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS64-EB-AS %s
// MIPS64-EB-AS: as{{(.exe)?}}" "-march" "mips3" "-mabi" "64" "-EB"
//
-// RUN: %clang -target mips64el-unknown-freebsd -### \
+// RUN: %clang --target=mips64el-unknown-freebsd -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS64-DEF-EL-AS %s
// MIPS64-DEF-EL-AS: as{{(.exe)?}}" "-march" "mips3" "-mabi" "64" "-EL"
//
-// RUN: %clang -target mips64-unknown-freebsd -mabi=n32 -### \
+// RUN: %clang --target=mips64-unknown-freebsd -mabi=n32 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-N32 %s
// MIPS-N32: as{{(.exe)?}}" "-march" "mips3" "-mabi" "n32" "-EB"
//
-// RUN: %clang -target mipsel-unknown-freebsd -mabi=32 -### \
+// RUN: %clang --target=mipsel-unknown-freebsd -mabi=32 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EL-AS %s
// MIPS32-EL-AS: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EL"
//
-// RUN: %clang -target mips64el-unknown-freebsd -mabi=64 -### \
+// RUN: %clang --target=mips64el-unknown-freebsd -mabi=64 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS64-EL-AS %s
// MIPS64-EL-AS: as{{(.exe)?}}" "-march" "mips3" "-mabi" "64" "-EL"
//
-// RUN: %clang -target mips-linux-freebsd -march=mips32r2 -### \
+// RUN: %clang --target=mips-linux-freebsd -march=mips32r2 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-32R2 %s
// MIPS-32R2: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "32" "-EB"
//
-// RUN: %clang -target mips-unknown-freebsd -mips32 -### \
+// RUN: %clang --target=mips-unknown-freebsd -mips32 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-ALIAS-32 %s
// MIPS-ALIAS-32: as{{(.exe)?}}" "-march" "mips32" "-mabi" "32" "-EB"
//
-// RUN: %clang -target mips-unknown-freebsd -mips32r2 -### \
+// RUN: %clang --target=mips-unknown-freebsd -mips32r2 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-ALIAS-32R2 %s
// MIPS-ALIAS-32R2: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "32" "-EB"
//
-// RUN: %clang -target mips64-unknown-freebsd -mips64 -### \
+// RUN: %clang --target=mips64-unknown-freebsd -mips64 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-ALIAS-64 %s
// MIPS-ALIAS-64: as{{(.exe)?}}" "-march" "mips64" "-mabi" "64" "-EB"
//
-// RUN: %clang -target mips64-unknown-freebsd -mips64r2 -### \
+// RUN: %clang --target=mips64-unknown-freebsd -mips64r2 -### \
// RUN: -no-integrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS-ALIAS-64R2 %s
// MIPS-ALIAS-64R2: as{{(.exe)?}}" "-march" "mips64r2" "-mabi" "64" "-EB"
//
-// RUN: %clang -target mips-unknown-freebsd -### \
+// RUN: %clang --target=mips-unknown-freebsd -### \
// RUN: -no-integrated-as -G0 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix=MIPS32-EB-AS-G0 %s
// MIPS32-EB-AS-G0: as{{(.exe)?}}" "-march" "mips2" "-mabi" "32" "-EB" "-G0"
diff --git a/clang/test/Driver/freebsd.cpp b/clang/test/Driver/freebsd.cpp
index 6ddab9199905..dc8c98d3c3cb 100644
--- a/clang/test/Driver/freebsd.cpp
+++ b/clang/test/Driver/freebsd.cpp
@@ -1,15 +1,15 @@
-// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -o %t.o --target=amd64-unknown-freebsd -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
-// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -o %t.o --target=amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-TEN %s
// CHECK-DEFAULT: "-lc++" "-lm"
// CHECK-TEN: "-lc++" "-lm"
-// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PG-DEFAULT %s
-// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd14.0 -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd14.0 -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PG-FOURTEEN %s
-// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
+// RUN: %clangxx %s -### -pg -o %t.o --target=amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
// RUN: | FileCheck --check-prefix=CHECK-PG-TEN %s
// CHECK-PG-DEFAULT: "-lc++" "-lm"
// CHECK-PG-FOURTEEN: "-lc++" "-lm"
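As a sanity check of the FreeBSD C++ link line (the output name and source file are placeholders), the platform stdlib selection can be inspected the same way:

$ clang++ --target=amd64-unknown-freebsd -stdlib=platform -### example.cpp -o example.o 2>&1 | grep '"-lc++" "-lm"'

The CHECK-DEFAULT and CHECK-TEN prefixes expect "-lc++" "-lm" on the link line, and the -pg variants for current FreeBSD (CHECK-PG-DEFAULT, CHECK-PG-FOURTEEN) check the same pair.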
diff --git a/clang/test/Driver/fsanitize-coverage.c b/clang/test/Driver/fsanitize-coverage.c
index d34ad5f6698f..c2de897f80ee 100644
--- a/clang/test/Driver/fsanitize-coverage.c
+++ b/clang/test/Driver/fsanitize-coverage.c
@@ -1,45 +1,45 @@
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
// CHECK-SANITIZE-COVERAGE-0-NOT: fsanitize-coverage-type
// CHECK-SANITIZE-COVERAGE-0: -fsanitize=address
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-address -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=hwaddress -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-hwaddress -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=bounds -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=thread -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=kcfi -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target %itanium_abi_triple -fsanitize=float-divide-by-zero -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=kernel-address -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=hwaddress -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=kernel-hwaddress -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=kernel-memory -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=bounds -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=thread -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=kcfi -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=%itanium_abi_triple -fsanitize=float-divide-by-zero -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
// CHECK-SANITIZE-COVERAGE-FUNC: fsanitize-coverage-type=1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-BB
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-BB
// CHECK-SANITIZE-COVERAGE-BB: fsanitize-coverage-type=2
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-EDGE
// CHECK-SANITIZE-COVERAGE-EDGE: fsanitize-coverage-type=3
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC_INDIR
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC_INDIR
// CHECK-SANITIZE-COVERAGE-FUNC_INDIR: fsanitize-coverage-type=3
// CHECK-SANITIZE-COVERAGE-FUNC_INDIR: fsanitize-coverage-indirect-calls
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
// CHECK-SANITIZE-COVERAGE-1: warning: argument '-fsanitize-coverage=1' is deprecated, use '-fsanitize-coverage=trace-pc-guard' instead
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_FUNC_BB_EDGE_DEPRECATED
// CHECK_FUNC_BB_EDGE_DEPRECATED: warning: argument '-fsanitize-coverage=[func|bb|edge]' is deprecated, use '-fsanitize-coverage=[func|bb|edge],[trace-pc-guard|trace-pc],[control-flow]' instead
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls,trace-pc,trace-cmp,trace-loads,trace-stores,trace-div,trace-gep %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FEATURES
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls,trace-pc,trace-cmp,trace-loads,trace-stores,trace-div,trace-gep %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FEATURES
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-type=3
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-indirect-calls
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-cmp
@@ -49,7 +49,7 @@
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-loads
// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-stores
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,edge,indirect-calls,trace-cmp -fno-sanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MASK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,edge,indirect-calls,trace-cmp -fno-sanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MASK
// CHECK-MASK: -fsanitize-coverage-type=1
// CHECK-MASK: -fsanitize-coverage-trace-cmp
// CHECK-MASK-NOT: -fsanitize-coverage-
@@ -60,30 +60,30 @@
// RUN: not %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INCOMPATIBLE
// CHECK-INCOMPATIBLE: error: invalid argument '-fsanitize-coverage=func' not allowed with '-fsanitize-coverage=edge'
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-8BIT
// CHECK-8BIT: warning: argument '-fsanitize-coverage=8bit-counters' is deprecated, use '-fsanitize-coverage=trace-pc-guard' instead
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE-BB
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE-BB
// CHECK-TRACE-BB: warning: argument '-fsanitize-coverage=trace-bb' is deprecated, use '-fsanitize-coverage=trace-pc-guard' instead
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
// CHECK-TRACE_PC_EDGE: -fsanitize-coverage-type=3
// CHECK-TRACE_PC_EDGE: -fsanitize-coverage-trace-pc
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_FUNC
// CHECK-TRACE_PC_FUNC: -fsanitize-coverage-type=1
// CHECK-TRACE_PC_FUNC: -fsanitize-coverage-trace-pc
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_EDGE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_EDGE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_EDGE
// CHECK-TRACE_PC_GUARD_EDGE: -fsanitize-coverage-type=3
// CHECK-TRACE_PC_GUARD_EDGE: -fsanitize-coverage-trace-pc-guard
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=func,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_FUNC
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=func,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_GUARD_FUNC
// CHECK-TRACE_PC_GUARD_FUNC: -fsanitize-coverage-type=1
// CHECK-TRACE_PC_GUARD_FUNC: -fsanitize-coverage-trace-pc-guard
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=stack-depth %s \
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=stack-depth %s \
// RUN: -### 2>&1 | FileCheck %s --check-prefix=CHECK-STACK-DEPTH
-// RUN: %clang -target x86_64-linux-gnu \
+// RUN: %clang --target=x86_64-linux-gnu \
// RUN: -fsanitize-coverage=trace-pc-guard,stack-depth %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CHECK-STACK-DEPTH-PC-GUARD
// CHECK-STACK-DEPTH: -fsanitize-coverage-type=1
@@ -92,35 +92,35 @@
// CHECK-STACK-DEPTH-PC-GUARD: -fsanitize-coverage-trace-pc-guard
// CHECK-STACK-DEPTH-PC-GUARD: -fsanitize-coverage-stack-depth
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=trace-cmp,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TYPE-NECESSARY
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=trace-cmp,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TYPE-NECESSARY
// CHECK-NO-TYPE-NECESSARY-NOT: error:
// CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-indirect-calls
// CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-trace-cmp
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fsanitize-coverage=trace-cmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-EXTEND-LEGACY
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fsanitize-coverage=trace-cmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-EXTEND-LEGACY
// CHECK-EXTEND-LEGACY: -fsanitize-coverage-type=1
// CHECK-EXTEND-LEGACY: -fsanitize-coverage-trace-cmp
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=no-prune,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_NOPRUNE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=no-prune,func,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_NOPRUNE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=no-prune,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_NOPRUNE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=no-prune,func,trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_NOPRUNE
// CHECK_NOPRUNE: -fsanitize-coverage-no-prune
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=inline-8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE8BIT
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=bb,inline-8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=inline-8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=bb,inline-8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE8BIT
// CHECK_INLINE8BIT-NOT: warning:
// CHECK_INLINE8BIT: -fsanitize-coverage-inline-8bit-counters
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=inline-8bit-counters,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINE8BIT
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINE8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=inline-8bit-counters,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINE8BIT
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINE8BIT
// CHECK_PC_TABLE_FOR_INLINE8BIT: -fsanitize-coverage-pc-table
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=inline-bool-flag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE_BOOL_FLAG
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=bb,inline-bool-flag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE_BOOL_FLAG
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=inline-bool-flag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE_BOOL_FLAG
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=bb,inline-bool-flag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_INLINE_BOOL_FLAG
// CHECK_INLINE_BOOL_FLAG-NOT: warning:
// CHECK_INLINE_BOOL_FLAG: -fsanitize-coverage-inline-bool-flag
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=inline-bool-flag,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINEBOOL
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINEBOOL
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=inline-bool-flag,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINEBOOL
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-coverage=trace-pc-guard,pc-table %s -### 2>&1 | FileCheck %s --check-prefix=CHECK_PC_TABLE_FOR_INLINEBOOL
// CHECK_PC_TABLE_FOR_INLINEBOOL: -fsanitize-coverage-pc-table
// RUN: %clang_cl --target=i386-pc-win32 -fsanitize=address -fsanitize-coverage=func,trace-pc-guard -c -### -- %s 2>&1 | FileCheck %s -check-prefix=CLANG-CL-COVERAGE
@@ -131,11 +131,11 @@
// CLANG-CL-COVERAGE: -fsanitize-coverage-type=1
// CLANG-CL-COVERAGE: -fsanitize=address
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-VS-SAFESTACK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=safe-stack -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-VS-SAFESTACK
// CHECK-VS-SAFESTACK: -fsanitize-coverage-trace-pc-guard
// CHECK-VS-SAFESTACK: -fsanitize=safe-stack
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -fsanitize-coverage=trace-pc-guard -fno-sanitize=safe-stack %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SAFESTACK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=safe-stack -fsanitize-coverage=trace-pc-guard -fno-sanitize=safe-stack %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SAFESTACK
// CHECK-NO-SAFESTACK-NOT: error:
// CHECK-NO-SAFESTACK-NOT: warning:
// CHECK-NO-SAFESTACK-NOT: argument unused
@@ -143,11 +143,11 @@
// CHECK-NO-SAFESTACK-NOT: -fsanitize=safe-stack
// CHECK-NO-SAFESTACK: -fsanitize-coverage-trace-pc-guard
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=shadow-call-stack -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-VS-SHADOWCALLSTACK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=shadow-call-stack -fsanitize-coverage=trace-pc-guard %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-VS-SHADOWCALLSTACK
// CHECK-VS-SHADOWCALLSTACK: -fsanitize-coverage-trace-pc-guard
// CHECK-VS-SHADOWCALLSTACK: -fsanitize=shadow-call-stack
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=shadow-call-stack -fsanitize-coverage=trace-pc-guard -fno-sanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SAFESTACK
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=shadow-call-stack -fsanitize-coverage=trace-pc-guard -fno-sanitize=shadow-call-stack %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SAFESTACK
// CHECK-NO-SHADOWCALLSTACK-NOT: error:
// CHECK-NO-SHADOWCALLSTACK-NOT: warning:
// CHECK-NO-SHADOWCALLSTACK-NOT: argument unused
diff --git a/clang/test/Driver/fsanitize-ignorelist.c b/clang/test/Driver/fsanitize-ignorelist.c
index c4669e50bb09..7dd666a45319 100644
--- a/clang/test/Driver/fsanitize-ignorelist.c
+++ b/clang/test/Driver/fsanitize-ignorelist.c
@@ -11,37 +11,37 @@
// RUN: echo "fun:bar" > %t.second
// RUN: echo "badline" > %t.bad
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IGNORELIST
-// RUN: %clang -target aarch64-linux-gnu -fsanitize=hwaddress -fsanitize-ignorelist=%t.good -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IGNORELIST
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IGNORELIST
+// RUN: %clang --target=aarch64-linux-gnu -fsanitize=hwaddress -fsanitize-ignorelist=%t.good -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-IGNORELIST
// CHECK-IGNORELIST: -fsanitize-ignorelist={{.*}}.good" "-fsanitize-ignorelist={{.*}}.second
// Check that the default ignorelist is not added as an extra dependency.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-IGNORELIST-ASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-IGNORELIST-ASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
// CHECK-DEFAULT-IGNORELIST-ASAN: -fsanitize-system-ignorelist={{.*[^w]}}asan_ignorelist.txt
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=hwaddress -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-IGNORELIST-HWASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=hwaddress -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-IGNORELIST-HWASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
// CHECK-DEFAULT-IGNORELIST-HWASAN: -fsanitize-system-ignorelist={{.*}}hwasan_ignorelist.txt
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=integer -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=nullability -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=alignment -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
-// RUN: %clang -target %itanium_abi_triple -fsanitize=float-divide-by-zero -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=integer -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=nullability -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=alignment -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=%itanium_abi_triple -fsanitize=float-divide-by-zero -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
// CHECK-DEFAULT-UBSAN-IGNORELIST: -fsanitize-system-ignorelist={{.*}}ubsan_ignorelist.txt
// Check that combining ubsan and another sanitizer results in both ignorelists being used.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined,address -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --check-prefix=CHECK-DEFAULT-IGNORELIST-ASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined,address -resource-dir=%S/Inputs/resource_dir %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEFAULT-UBSAN-IGNORELIST --check-prefix=CHECK-DEFAULT-IGNORELIST-ASAN --implicit-check-not=fdepfile-entry --implicit-check-not=-fsanitize-ignorelist=
// Ignore -fsanitize-ignorelist flag if there is no -fsanitize flag.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-ignorelist=%t.good %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE --check-prefix=DELIMITERS
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-ignorelist=%t.good %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE --check-prefix=DELIMITERS
// CHECK-NO-SANITIZE-NOT: -fsanitize-ignorelist
// Ignore -fsanitize-ignorelist flag if there is no -fsanitize flag.
// Now, check for the absence of -fdepfile-entry flags.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize-ignorelist=%t.good %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE2 --check-prefix=DELIMITERS
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize-ignorelist=%t.good %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-SANITIZE2 --check-prefix=DELIMITERS
// CHECK-NO-SANITIZE2-NOT: -fdepfile-entry
// Flag -fno-sanitize-ignorelist wins if it is specified later.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fno-sanitize-ignorelist %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IGNORELIST --check-prefix=DELIMITERS
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fno-sanitize-ignorelist %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-IGNORELIST --check-prefix=DELIMITERS
// CHECK-NO-IGNORELIST-NOT: -fsanitize-ignorelist
// Driver barks on nonexistent ignorelist files.
@@ -53,13 +53,13 @@
// CHECK-BAD-IGNORELIST: error: malformed sanitizer ignorelist: 'error parsing file '{{.*}}.bad': malformed line 1: 'badline''
// -fno-sanitize-ignorelist disables all ignorelists specified earlier.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fno-sanitize-ignorelist -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ONLY-FIRST-DISABLED --implicit-check-not=-fsanitize-ignorelist=
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fsanitize-ignorelist=%t.good -fno-sanitize-ignorelist -fsanitize-ignorelist=%t.second %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ONLY-FIRST-DISABLED --implicit-check-not=-fsanitize-ignorelist=
// CHECK-ONLY_FIRST-DISABLED-NOT: good
// CHECK-ONLY-FIRST-DISABLED: -fsanitize-ignorelist={{.*}}.second
// CHECK-ONLY_FIRST-DISABLED-NOT: good
// -fno-sanitize-ignorelist disables the system ignorelists.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fno-sanitize-ignorelist %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DISABLED-SYSTEM --check-prefix=DELIMITERS
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=address -fno-sanitize-ignorelist %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-DISABLED-SYSTEM --check-prefix=DELIMITERS
// CHECK-DISABLED-SYSTEM-NOT: -fsanitize-system-ignorelist
// If cfi_ignorelist.txt cannot be found in the resource dir, driver should fail.
@@ -67,7 +67,7 @@
// CHECK-MISSING-CFI-IGNORELIST: error: missing sanitizer ignorelist: '{{.*}}cfi_ignorelist.txt'
// -fno-sanitize-ignorelist disables checking for cfi_ignorelist.txt in the resource dir.
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi -flto -fvisibility=default -fno-sanitize-ignorelist -resource-dir=/dev/null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-CFI-NO-IGNORELIST
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=cfi -flto -fvisibility=default -fno-sanitize-ignorelist -resource-dir=/dev/null %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-CFI-NO-IGNORELIST
// CHECK-MISSING-CFI-NO-IGNORELIST-NOT: error: no such file or directory: '{{.*}}cfi_ignorelist.txt'
// DELIMITERS: {{^ *"}}
diff --git a/clang/test/Driver/fsanitize-memory-param-retval.c b/clang/test/Driver/fsanitize-memory-param-retval.c
index 79ade32178b6..99d8cb7f55e5 100644
--- a/clang/test/Driver/fsanitize-memory-param-retval.c
+++ b/clang/test/Driver/fsanitize-memory-param-retval.c
@@ -1,14 +1,14 @@
-// RUN: %clang -target i386-gnu-linux %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target x86_64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target riscv32-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target riscv64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
-// RUN: %clang -target x86_64-linux-gnu %s -fsanitize=kernel-memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=i386-gnu-linux %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=x86_64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv32-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=riscv64-linux-gnu %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
+// RUN: %clang --target=x86_64-linux-gnu %s -fsanitize=kernel-memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck %s
// CHECK: "-fno-sanitize-memory-param-retval"
-// RUN: %clang -target aarch64-linux-gnu -fsyntax-only %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck --check-prefix=11 %s
+// RUN: %clang --target=aarch64-linux-gnu -fsyntax-only %s -fsanitize=memory -fno-sanitize-memory-param-retval -c -### 2>&1 | FileCheck --check-prefix=11 %s
// 11: "-fno-sanitize-memory-param-retval"
-// RUN: not %clang -target x86_64-linux-gnu -fsyntax-only %s -fsanitize=memory -fno-sanitize-memory-param-retval=1 2>&1 | FileCheck --check-prefix=EXCESS %s
+// RUN: not %clang --target=x86_64-linux-gnu -fsyntax-only %s -fsanitize=memory -fno-sanitize-memory-param-retval=1 2>&1 | FileCheck --check-prefix=EXCESS %s
// EXCESS: error: unknown argument: '-fno-sanitize-memory-param-retval=
diff --git a/clang/test/Driver/fsanitize-metadata-ignorelist.c b/clang/test/Driver/fsanitize-metadata-ignorelist.c
index 65a45ccb1404..ad5f4be16768 100644
--- a/clang/test/Driver/fsanitize-metadata-ignorelist.c
+++ b/clang/test/Driver/fsanitize-metadata-ignorelist.c
@@ -3,12 +3,12 @@
// RUN: echo "fun:foo" > %t.1
// RUN: echo "fun:bar" > %t.2
-// RUN: %clang -target x86_64-linux-gnu -fexperimental-sanitize-metadata=all -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s
-// RUN: %clang -target aarch64-linux-gnu -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s
+// RUN: %clang --target=x86_64-linux-gnu -fexperimental-sanitize-metadata=all -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s
+// RUN: %clang --target=aarch64-linux-gnu -fexperimental-sanitize-metadata=atomics -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s
// CHECK: "-fexperimental-sanitize-metadata-ignorelist={{.*}}.1" "-fexperimental-sanitize-metadata-ignorelist={{.*}}.2"
// Verify -fsanitize-metadata-ignorelist flag not passed if there is no -fsanitize-metadata flag.
-// RUN: %clang -target x86_64-linux-gnu -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s --check-prefix=NOSANMD
-// RUN: %clang -target aarch64-linux-gnu -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s --check-prefix=NOSANMD
+// RUN: %clang --target=x86_64-linux-gnu -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s --check-prefix=NOSANMD
+// RUN: %clang --target=aarch64-linux-gnu -fexperimental-sanitize-metadata-ignorelist=%t.1 -fexperimental-sanitize-metadata-ignorelist=%t.2 %s -### 2>&1 | FileCheck %s --check-prefix=NOSANMD
// NOSANMD: warning: argument unused during compilation: '-fexperimental-sanitize-metadata-ignorelist
// NOSANMD-NOT: "-fexperimental-sanitize-metadata-ignorelist
diff --git a/clang/test/Driver/fsanitize-object-size.c b/clang/test/Driver/fsanitize-object-size.c
index 50c67838df39..78c720288641 100644
--- a/clang/test/Driver/fsanitize-object-size.c
+++ b/clang/test/Driver/fsanitize-object-size.c
@@ -1,27 +1,27 @@
// Check that the object size check is disabled at -O0.
//
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size %s -O0 -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=null,object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -Werror -fsanitize=null,object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE-NO-WARNING
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size %s -O0 -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=null,object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -Werror -fsanitize=null,object-size %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OSIZE-NO-WARNING
// Check that the object size check is enabled at other optimization levels.
//
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -O2 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -O3 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -O4 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -Ofast %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -Os %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -Oz %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=object-size -Og %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -O2 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -O3 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -O4 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -Ofast %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -Os %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -Oz %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=object-size -Og %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
// Use of trap mode shouldn't affect the object size check.
//
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined-trap -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined-trap -fsanitize-undefined-trap-on-error -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-trap=undefined -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined-trap -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined-trap -fsanitize-undefined-trap-on-error -O1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-HAS-OSIZE
// CHECK-HAS-OSIZE-NOT: warning: the object size sanitizer
// CHECK-HAS-OSIZE: -fsanitize={{[^ ]*}}object-size
diff --git a/clang/test/Driver/fsemantic-interposition.c b/clang/test/Driver/fsemantic-interposition.c
index 0ee0dbb3be34..aaa44878483c 100644
--- a/clang/test/Driver/fsemantic-interposition.c
+++ b/clang/test/Driver/fsemantic-interposition.c
@@ -1,20 +1,20 @@
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fpic -fsemantic-interposition -c -### 2>&1 | FileCheck %s
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fPIC -fsemantic-interposition -c -### 2>&1 | FileCheck %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fpic -fsemantic-interposition -c -### 2>&1 | FileCheck %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fPIC -fsemantic-interposition -c -### 2>&1 | FileCheck %s
// CHECK: "-fsemantic-interposition"
/// No-op for -fno-pic/-fpie.
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NOOP %s
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fPIE -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NOOP %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NOOP %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fPIE -fsemantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NOOP %s
// NOOP-NOT: "-fsemantic-interposition"
// NOOP-NOT: "-fno-semantic-interposition"
/// If -fno-semantic-interposition is specified and the target supports local
/// aliases, neither CC1 option is set.
-// RUN: %clang --sysroot=%S/Inputs -target aarch64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
-// RUN: %clang --sysroot=%S/Inputs -target riscv32 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
-// RUN: %clang --sysroot=%S/Inputs -target riscv64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
-// RUN: %clang --sysroot=%S/Inputs -target i386 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=aarch64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=riscv32 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=riscv64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=i386 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=NO %s
// NO-NOT: "-fsemantic-interposition"
// NO-NOT: "-fhalf-no-semantic-interposition"
@@ -23,8 +23,8 @@
/// local aliases, use the traditional half-baked behavior: interprocedural
/// optimizations are allowed but local aliases are not used. If references are
/// not optimized out, semantic interposition at runtime is possible.
-// RUN: %clang --sysroot=%S/Inputs -target ppc64le %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=HALF %s
+// RUN: %clang --sysroot=%S/Inputs --target=ppc64le %s -Werror -fPIC -fno-semantic-interposition -c -### 2>&1 | FileCheck --check-prefix=HALF %s
-// RUN: %clang --sysroot=%S/Inputs -target x86_64 %s -Werror -fPIC -c -### 2>&1 | FileCheck --check-prefix=HALF %s
+// RUN: %clang --sysroot=%S/Inputs --target=x86_64 %s -Werror -fPIC -c -### 2>&1 | FileCheck --check-prefix=HALF %s
//
// HALF: "-fhalf-no-semantic-interposition"
diff --git a/clang/test/Driver/fsjlj-exceptions.c b/clang/test/Driver/fsjlj-exceptions.c
index fd16a51b1f69..122513f6b611 100644
--- a/clang/test/Driver/fsjlj-exceptions.c
+++ b/clang/test/Driver/fsjlj-exceptions.c
@@ -1,6 +1,6 @@
// RUN: %clang -target armv7-apple-ios -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-IOS %s
-// RUN: %clang -target i686-windows-gnu -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-MINGW-DEFAULT %s
-// RUN: %clang -target i686-windows-gnu -fexceptions -fsjlj-exceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-MINGW-SJLJ %s
+// RUN: %clang --target=i686-windows-gnu -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck --check-prefix=CHECK-MINGW-DEFAULT %s
+// RUN: %clang --target=i686-windows-gnu -fexceptions -fsjlj-exceptions -c %s -o /dev/null -### 2>&1 | FileCheck --check-prefix=CHECK-MINGW-SJLJ %s
// CHECK-IOS: -exception-model=sjlj
// CHECK-MINGW-DEFAULT-NOT: -exception-model=sjlj
diff --git a/clang/test/Driver/fuse-ld-windows.c b/clang/test/Driver/fuse-ld-windows.c
index 089f2961b75d..8a5af61c6e09 100644
--- a/clang/test/Driver/fuse-ld-windows.c
+++ b/clang/test/Driver/fuse-ld-windows.c
@@ -1,23 +1,23 @@
// REQUIRES: system-windows
// We used to require adding ".exe" suffix when cross-compiling on Windows.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
// RUN: -B %S/Inputs/fuse_ld_windows -fuse-ld=foo 2>&1 \
// RUN: | FileCheck %s
// Check that the old variant still works.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
// RUN: -B %S/Inputs/fuse_ld_windows -fuse-ld=foo.exe 2>&1 \
// RUN: | FileCheck %s
// With the full path, the extension can be omitted, too,
// because Windows allows that.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
// RUN: -fuse-ld=%S/Inputs/fuse_ld_windows/ld.foo 2>&1 \
// RUN: | FileCheck %s
// Check that the full path with the extension works too.
-// RUN: %clang %s -### -o %t.o -target i386-unknown-linux \
+// RUN: %clang %s -### -o %t.o --target=i386-unknown-linux \
// RUN: -fuse-ld=%S/Inputs/fuse_ld_windows/ld.foo.exe 2>&1 \
// RUN: | FileCheck %s
diff --git a/clang/test/Driver/fuse-ld.c b/clang/test/Driver/fuse-ld.c
index ef2f8c92a370..f807434dad10 100644
--- a/clang/test/Driver/fuse-ld.c
+++ b/clang/test/Driver/fuse-ld.c
@@ -15,88 +15,88 @@
// CHECK-NO-WARN-NOT: warning:
// RUN: %clang %s -### \
-// RUN: -target x86_64-unknown-freebsd 2>&1 \
+// RUN: --target=x86_64-unknown-freebsd 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-FREEBSD-LD
// CHECK-FREEBSD-LD: ld
// RUN: %clang %s -### -fuse-ld=bfd \
// RUN: --sysroot=%S/Inputs/basic_freebsd_tree \
-// RUN: -target x86_64-unknown-freebsd \
+// RUN: --target=x86_64-unknown-freebsd \
// RUN: -B%S/Inputs/basic_freebsd_tree/usr/bin 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-FREEBSD-BFD
// CHECK-FREEBSD-BFD: Inputs/basic_freebsd_tree/usr/bin{{/|\\+}}ld.bfd
// RUN: %clang %s -### -fuse-ld=gold \
// RUN: --sysroot=%S/Inputs/basic_freebsd_tree \
-// RUN: -target x86_64-unknown-freebsd \
+// RUN: --target=x86_64-unknown-freebsd \
// RUN: -B%S/Inputs/basic_freebsd_tree/usr/bin 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-FREEBSD-GOLD
// CHECK-FREEBSD-GOLD: Inputs/basic_freebsd_tree/usr/bin{{/|\\+}}ld.gold
// RUN: not %clang %s -### -fuse-ld=plib \
// RUN: --sysroot=%S/Inputs/basic_freebsd_tree \
-// RUN: -target x86_64-unknown-freebsd \
+// RUN: --target=x86_64-unknown-freebsd \
// RUN: -B%S/Inputs/basic_freebsd_tree/usr/bin 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-FREEBSD-PLIB
// CHECK-FREEBSD-PLIB: error: invalid linker name
// RUN: %clang %s -### -fuse-ld=ld \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: -B%S/Inputs/basic_android_tree/bin/arm-linux-androideabi- 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ANDROID-ARM-LD
// CHECK-ANDROID-ARM-LD: ld.lld
// RUN: %clang %s -### -fuse-ld=bfd \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: -B%S/Inputs/basic_android_tree/bin/arm-linux-androideabi- 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-ANDROID-ARM-BFD
// CHECK-ANDROID-ARM-BFD: Inputs/basic_android_tree/bin{{/|\\+}}arm-linux-androideabi-ld.bfd
// RUN: %clang %s -### -fuse-ld=gold \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: -B%S/Inputs/basic_android_tree/bin/arm-linux-androideabi- 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-ANDROID-ARM-GOLD
// CHECK-ANDROID-ARM-GOLD: Inputs/basic_android_tree/bin{{/|\\+}}arm-linux-androideabi-ld.gold
// RUN: %clang %s -### -fuse-ld=ld \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree 2>&1 \
// RUN: | FileCheck %s --check-prefix=CHECK-ANDROID-ARM-LD-TC
// CHECK-ANDROID-ARM-LD-TC: ld.lld
// RUN: %clang %s -### -fuse-ld=bfd \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-ANDROID-ARM-BFD-TC
// CHECK-ANDROID-ARM-BFD-TC: Inputs/basic_android_tree/lib/gcc/arm-linux-androideabi/4.4.3/../../../../arm-linux-androideabi/bin{{/|\\+}}ld.bfd
// RUN: %clang %s -### -fuse-ld=gold \
-// RUN: -target arm-linux-androideabi \
+// RUN: --target=arm-linux-androideabi \
// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree 2>&1 \
// RUN: | FileCheck %s -check-prefix=CHECK-ANDROID-ARM-GOLD-TC
// CHECK-ANDROID-ARM-GOLD-TC: Inputs/basic_android_tree/lib/gcc/arm-linux-androideabi/4.4.3/../../../../arm-linux-androideabi/bin{{/|\\+}}ld.gold
// RUN: %clang %s -### -fuse-ld=link \
-// RUN: -target i686-unknown-windows-msvc 2>&1 \
+// RUN: --target=i686-unknown-windows-msvc 2>&1 \
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-LINK
// CHECK-WINDOWS-MSVC-LINK: "{{.*}}link.exe"
// CHECK-WINDOWS-MSVC-LINK-SAME: "-out:{{.*}}"
// RUN: %clang %s -### -fuse-ld=lld \
-// RUN: -target i686-unknown-windows-msvc 2>&1 \
+// RUN: --target=i686-unknown-windows-msvc 2>&1 \
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-LLD
// CHECK-WINDOWS-MSVC-LLD: "{{.*}}lld-link{{\.exe"|"}}
// CHECK-WINDOWS-MSVC-LLD-SAME: "-out:{{.*}}"
// RUN: %clang %s -### -fuse-ld=lld-link \
-// RUN: -target i686-unknown-windows-msvc 2>&1 \
+// RUN: --target=i686-unknown-windows-msvc 2>&1 \
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-LLD-LINK
// CHECK-WINDOWS-MSVC-LLD-LINK: "{{.*}}lld-link{{\.exe"|"}}
// CHECK-WINDOWS-MSVC-LLD-LINK-SAME: "-out:{{.*}}"
// RUN: %clang %s -### -fuse-ld=bfd \
-// RUN: -target i686-unknown-windows-msvc \
+// RUN: --target=i686-unknown-windows-msvc \
// RUN: -B %S/Inputs/Windows/usr/bin 2>&1 \
// RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-BFD
// CHECK-WINDOWS-MSVC-BFD: "{{.*}}ld.bfd"
diff --git a/clang/test/Driver/fuzzer.c b/clang/test/Driver/fuzzer.c
index 14caf7690057..409fbfac8ce1 100644
--- a/clang/test/Driver/fuzzer.c
+++ b/clang/test/Driver/fuzzer.c
@@ -8,7 +8,7 @@
// CHECK-COVERAGE-SAME: -fsanitize-coverage-pc-table
// CHECK-FUZZER-LIB: libclang_rt.fuzzer
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=platform %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-LINUX %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=platform %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-LINUX %s
//
// CHECK-LIBCXX-LINUX: -lstdc++
@@ -29,18 +29,18 @@
// Check that we respect whether the standard library should be linked
// statically.
//
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBSTDCXX-DYNAMIC %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBSTDCXX-DYNAMIC %s
// CHECK-LIBSTDCXX-DYNAMIC-NOT: -Bstatic
// CHECK-LIBSTDCXX-DYNAMIC: -lstdc++
//
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=libstdc++ -static-libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBSTDCXX-STATIC %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=libstdc++ -static-libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBSTDCXX-STATIC %s
// CHECK-LIBSTDCXX-STATIC: "-Bstatic" "-lstdc++"
//
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=libc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-DYNAMIC %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=libc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-DYNAMIC %s
// CHECK-LIBCXX-DYNAMIC-NOT: -Bstatic
// CHECK-LIBCXX-DYNAMIC: -lc++
//
-// RUN: %clang -fsanitize=fuzzer -target i386-unknown-linux -stdlib=libc++ -static-libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-STATIC %s
+// RUN: %clang -fsanitize=fuzzer --target=i386-unknown-linux -stdlib=libc++ -static-libstdc++ %s -### 2>&1 | FileCheck --check-prefixes=CHECK-LIBCXX-STATIC %s
// CHECK-LIBCXX-STATIC: "-Bstatic" "-lc++"
int LLVMFuzzerTestOneInput(const char *Data, long Size) {
diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c
index 8a230284bcdf..9b0f1ce13aa2 100644
--- a/clang/test/Driver/fveclib.c
+++ b/clang/test/Driver/fveclib.c
@@ -1,11 +1,11 @@
-// RUN: %clang -### -c -fveclib=none %s 2>&1 | FileCheck -check-prefix CHECK-NOLIB %s
-// RUN: %clang -### -c -fveclib=Accelerate %s 2>&1 | FileCheck -check-prefix CHECK-ACCELERATE %s
-// RUN: %clang -### -c -fveclib=libmvec %s 2>&1 | FileCheck -check-prefix CHECK-libmvec %s
-// RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck -check-prefix CHECK-MASSV %s
-// RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck -check-prefix CHECK-DARWIN_LIBSYSTEM_M %s
-// RUN: %clang -### -c --target=aarch64-none-none -fveclib=SLEEF %s 2>&1 | FileCheck -check-prefix CHECK-SLEEF %s
-// RUN: %clang -### -c --target=aarch64-none-none -fveclib=ArmPL %s 2>&1 | FileCheck -check-prefix CHECK-ARMPL %s
-// RUN: not %clang -c -fveclib=something %s 2>&1 | FileCheck -check-prefix CHECK-INVALID %s
+// RUN: %clang -### -c -fveclib=none %s 2>&1 | FileCheck --check-prefix=CHECK-NOLIB %s
+// RUN: %clang -### -c -fveclib=Accelerate %s 2>&1 | FileCheck --check-prefix=CHECK-ACCELERATE %s
+// RUN: %clang -### -c -fveclib=libmvec %s 2>&1 | FileCheck --check-prefix=CHECK-libmvec %s
+// RUN: %clang -### -c -fveclib=MASSV %s 2>&1 | FileCheck --check-prefix=CHECK-MASSV %s
+// RUN: %clang -### -c -fveclib=Darwin_libsystem_m %s 2>&1 | FileCheck --check-prefix=CHECK-DARWIN_LIBSYSTEM_M %s
+// RUN: %clang -### -c --target=aarch64 -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-SLEEF %s
+// RUN: %clang -### -c --target=aarch64 -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ARMPL %s
+// RUN: not %clang -c -fveclib=something %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
// CHECK-NOLIB: "-fveclib=none"
// CHECK-ACCELERATE: "-fveclib=Accelerate"
@@ -17,10 +17,10 @@
// CHECK-INVALID: error: invalid value 'something' in '-fveclib=something'
-// RUN: not %clang --target=x86-none-none -c -fveclib=SLEEF %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
-// RUN: not %clang --target=x86-none-none -c -fveclib=ArmPL %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
-// RUN: not %clang --target=aarch64-none-none -c -fveclib=LIBMVEC-X86 %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
-// RUN: not %clang --target=aarch64-none-none -c -fveclib=SVML %s 2>&1 | FileCheck -check-prefix CHECK-ERROR %s
+// RUN: not %clang --target=x86 -c -fveclib=SLEEF %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+// RUN: not %clang --target=x86 -c -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+// RUN: not %clang --target=aarch64 -c -fveclib=LIBMVEC-X86 %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
+// RUN: not %clang --target=aarch64 -c -fveclib=SVML %s 2>&1 | FileCheck --check-prefix=CHECK-ERROR %s
// CHECK-ERROR: unsupported option {{.*}} for target
// RUN: %clang -fveclib=Accelerate %s -target arm64-apple-ios8.0.0 -### 2>&1 | FileCheck --check-prefix=CHECK-LINK %s
@@ -35,17 +35,17 @@
/* Verify that the correct vector library is passed to LTO flags. */
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=LIBMVEC -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-LIBMVEC %s
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=LIBMVEC -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-LIBMVEC %s
// CHECK-LTO-LIBMVEC: "-plugin-opt=-vector-library=LIBMVEC-X86"
-// RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-MASSV %s
+// RUN: %clang -### --target=powerpc64-unknown-linux-gnu -fveclib=MASSV -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-MASSV %s
// CHECK-LTO-MASSV: "-plugin-opt=-vector-library=MASSV"
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=SVML -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SVML %s
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fveclib=SVML -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-SVML %s
// CHECK-LTO-SVML: "-plugin-opt=-vector-library=SVML"
-// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=SLEEF -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-SLEEF %s
+// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=SLEEF -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-SLEEF %s
// CHECK-LTO-SLEEF: "-plugin-opt=-vector-library=sleefgnuabi"
-// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -flto %s 2>&1 | FileCheck -check-prefix CHECK-LTO-ARMPL %s
+// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL -flto %s 2>&1 | FileCheck --check-prefix=CHECK-LTO-ARMPL %s
// CHECK-LTO-ARMPL: "-plugin-opt=-vector-library=ArmPL"
diff --git a/clang/test/Driver/loongarch-mlasx-error.c b/clang/test/Driver/loongarch-mlasx-error.c
index e66f277f7c29..1d88f0f1a7c6 100644
--- a/clang/test/Driver/loongarch-mlasx-error.c
+++ b/clang/test/Driver/loongarch-mlasx-error.c
@@ -11,5 +11,5 @@
// RUN: not %clang --target=loongarch64 %s -fsyntax-only -mlasx -mno-lsx 2>&1 \
// RUN: FileCheck --check-prefix=ERROR_LASX_FPU128 %s
-// ERROR_LASX_FPU64: error: wrong fpu width; LASX depends on 64-bit FPU.
-// ERROR_LASX_FPU128: error: invalid option combination; LASX depends on LSX.
+// ERROR_LASX_FPU64: error: wrong fpu width; LASX depends on 64-bit FPU
+// ERROR_LASX_FPU128: error: invalid option combination; LASX depends on LSX
diff --git a/clang/test/Driver/loongarch-mlsx-error.c b/clang/test/Driver/loongarch-mlsx-error.c
index bd6b8e2718bf..db1f6fb2e5a0 100644
--- a/clang/test/Driver/loongarch-mlsx-error.c
+++ b/clang/test/Driver/loongarch-mlsx-error.c
@@ -9,4 +9,4 @@
// RUN: not %clang --target=loongarch64 %s -fsyntax-only -mlsx -mfpu=none 2>&1 \
// RUN: FileCheck --check-prefix=ERROR_LSX_FPU64 %s
-// ERROR_LSX_FPU64: error: wrong fpu width; LSX depends on 64-bit FPU.
+// ERROR_LSX_FPU64: error: wrong fpu width; LSX depends on 64-bit FPU
diff --git a/clang/test/Driver/ms-define-stdc.c b/clang/test/Driver/ms-define-stdc.c
new file mode 100644
index 000000000000..d5e873d21a76
--- /dev/null
+++ b/clang/test/Driver/ms-define-stdc.c
@@ -0,0 +1,11 @@
+// Note: %s must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+//
+// Note: see also cl-zc.cpp
+
+// RUN: %clang_cl /TC /dev/null /E -Xclang -dM /Zc:__STDC__- 2>&1 | FileCheck %s --check-prefix=ZCSTDCIGNORED
+// ZCSTDCIGNORED-NOT: #define __STDC__ 1
+// ZCSTDCIGNORED: argument unused during compilation
+
+// RUN: not %clang -Xclang -fno-ms-define-stdc %s 2>&1 | FileCheck %s --check-prefix="NOARG"
+// NOARG: error: unknown argument: '-fno-ms-define-stdc'
diff --git a/clang/test/Driver/openmp-offload-infer.c b/clang/test/Driver/openmp-offload-infer.c
index 50333293eb7d..388860abc01a 100644
--- a/clang/test/Driver/openmp-offload-infer.c
+++ b/clang/test/Driver/openmp-offload-infer.c
@@ -43,7 +43,7 @@
// RUN: --offload-arch=sm_70 --offload-arch=gfx908 --offload-arch=skylake \
// RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-FAILED
-// CHECK-FAILED: error: failed to deduce triple for target architecture 'skylake'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead.
+// CHECK-FAILED: error: failed to deduce triple for target architecture 'skylake'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead
// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -fopenmp=libomp \
// RUN: --offload-arch=sm_70 --offload-arch=gfx908 -fno-openmp \
diff --git a/clang/test/Driver/openmp-system-arch.c b/clang/test/Driver/openmp-system-arch.c
index 4e024e6b11d1..a48c1e76fa75 100644
--- a/clang/test/Driver/openmp-system-arch.c
+++ b/clang/test/Driver/openmp-system-arch.c
@@ -31,7 +31,7 @@
// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp --offload-arch= \
// RUN: --nvptx-arch-tool=%t/nvptx_arch_empty --amdgpu-arch-tool=%t/amdgpu_arch_empty %s 2>&1 \
// RUN: | FileCheck %s --check-prefix=NO-OUTPUT-ERROR
-// NO-OUTPUT-ERROR: error: failed to deduce triple for target architecture 'native'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead.
+// NO-OUTPUT-ERROR: error: failed to deduce triple for target architecture 'native'; specify the triple using '-fopenmp-targets' and '-Xopenmp-target' instead
// case when amdgpu-arch succeeds.
// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpulib -fopenmp=libomp --offload-arch=native \
diff --git a/clang/test/Driver/tocdata-cc1.c b/clang/test/Driver/tocdata-cc1.c
index fe0d97ea02db..e00383deecef 100644
--- a/clang/test/Driver/tocdata-cc1.c
+++ b/clang/test/Driver/tocdata-cc1.c
@@ -1,16 +1,13 @@
// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mcmodel=medium -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mcmodel=large -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-TOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mcmodel=medium -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mcmodel=large -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: | FileCheck %s
// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mtocdata %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHECK-TOC %s
-// CHECK-NOTOC: warning: ignoring '-mtocdata' as it is only supported for -mcmodel=small
-// CHECK-NOTOC-NOT: "-cc1"{{.*}}" "-mtocdata"
-// CHECK-TOC: "-cc1"{{.*}}" "-mtocdata"
-// CHECK-TOC-NOT: warning: ignoring '-mtocdata' as it is only supported for -mcmodel=small
+// RUN: | FileCheck %s
+// CHECK: "-cc1"{{.*}}" "-mtocdata"
diff --git a/clang/test/Driver/x-args.c b/clang/test/Driver/x-args.c
index 17bb5d99404d..06c9c7a46156 100644
--- a/clang/test/Driver/x-args.c
+++ b/clang/test/Driver/x-args.c
@@ -6,6 +6,4 @@
// RUN: %clang -fsyntax-only %s -xc %s -xc++ -fsyntax-only 2>&1 | FileCheck %s
// CHECK: '-x c++' after last input file has no effect
-// RUN: not %clang_cl /WX /clang:-xc /clang:-E /clang:-dM -- %s 2>&1 | FileCheck --implicit-check-not="error:" -check-prefix=CL %s
-// RUN: not %clang_cl /TC /WX /clang:-xc /clang:-E /clang:-dM -- %s 2>&1 | FileCheck --implicit-check-not="error:" -check-prefix=CL %s
-// CL: error: unsupported option '-x c'; did you mean '/TC' or '/TP'?
+// RUN: %clang_cl -fsyntax-only /WX -xc++ -- %s
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 25f8f66bc321..1d5f001c23fc 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -21,10 +21,10 @@
// SSE4-AES: "-target-feature" "+sse4.2" "-target-feature" "+aes"
// NO-SSE4-AES: "-target-feature" "-sse4.1" "-target-feature" "-aes"
-// RUN: %clang --target=i386 -march=i386 -mavx -mavx2 -mavx512f -mavx512cd -mavx512er -mavx512pf -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma %s -### 2>&1 | FileCheck -check-prefix=AVX %s
-// RUN: %clang --target=i386 -march=i386 -mno-avx -mno-avx2 -mno-avx512f -mno-avx512cd -mno-avx512er -mno-avx512pf -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512ifma %s -### 2>&1 | FileCheck -check-prefix=NO-AVX %s
-// AVX: "-target-feature" "+avx" "-target-feature" "+avx2" "-target-feature" "+avx512f" "-target-feature" "+avx512cd" "-target-feature" "+avx512er" "-target-feature" "+avx512pf" "-target-feature" "+avx512dq" "-target-feature" "+avx512bw" "-target-feature" "+avx512vl" "-target-feature" "+avx512vbmi" "-target-feature" "+avx512vbmi2" "-target-feature" "+avx512ifma"
-// NO-AVX: "-target-feature" "-avx" "-target-feature" "-avx2" "-target-feature" "-avx512f" "-target-feature" "-avx512cd" "-target-feature" "-avx512er" "-target-feature" "-avx512pf" "-target-feature" "-avx512dq" "-target-feature" "-avx512bw" "-target-feature" "-avx512vl" "-target-feature" "-avx512vbmi" "-target-feature" "-avx512vbmi2" "-target-feature" "-avx512ifma"
+// RUN: %clang --target=i386 -march=i386 -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512vbmi2 -mavx512ifma %s -### 2>&1 | FileCheck -check-prefix=AVX %s
+// RUN: %clang --target=i386 -march=i386 -mno-avx -mno-avx2 -mno-avx512f -mno-avx512cd -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512ifma %s -### 2>&1 | FileCheck -check-prefix=NO-AVX %s
+// AVX: "-target-feature" "+avx" "-target-feature" "+avx2" "-target-feature" "+avx512f" "-target-feature" "+avx512cd" "-target-feature" "+avx512dq" "-target-feature" "+avx512bw" "-target-feature" "+avx512vl" "-target-feature" "+avx512vbmi" "-target-feature" "+avx512vbmi2" "-target-feature" "+avx512ifma"
+// NO-AVX: "-target-feature" "-avx" "-target-feature" "-avx2" "-target-feature" "-avx512f" "-target-feature" "-avx512cd" "-target-feature" "-avx512dq" "-target-feature" "-avx512bw" "-target-feature" "-avx512vl" "-target-feature" "-avx512vbmi" "-target-feature" "-avx512vbmi2" "-target-feature" "-avx512ifma"
// RUN: %clang --target=i386 -march=i386 -mpclmul -mrdrnd -mfsgsbase -mbmi -mbmi2 %s -### 2>&1 | FileCheck -check-prefix=BMI %s
// RUN: %clang --target=i386 -march=i386 -mno-pclmul -mno-rdrnd -mno-fsgsbase -mno-bmi -mno-bmi2 %s -### 2>&1 | FileCheck -check-prefix=NO-BMI %s
@@ -86,11 +86,6 @@
// SGX: "-target-feature" "+sgx"
// NO-SGX: "-target-feature" "-sgx"
-// RUN: %clang --target=i386 -march=i386 -mprefetchwt1 %s -### 2>&1 | FileCheck -check-prefix=PREFETCHWT1 %s
-// RUN: %clang --target=i386 -march=i386 -mno-prefetchwt1 %s -### 2>&1 | FileCheck -check-prefix=NO-PREFETCHWT1 %s
-// PREFETCHWT1: "-target-feature" "+prefetchwt1"
-// NO-PREFETCHWT1: "-target-feature" "-prefetchwt1"
-
// RUN: %clang --target=i386 -march=i386 -mprefetchi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PREFETCHI %s
// RUN: %clang --target=i386 -march=i386 -mno-prefetchi %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-PREFETCHI %s
// PREFETCHI: "-target-feature" "+prefetchi"
diff --git a/clang/test/ExtractAPI/non_type_template.cpp b/clang/test/ExtractAPI/non_type_template.cpp
index 4e65eb790ca1..85f38e39c82b 100644
--- a/clang/test/ExtractAPI/non_type_template.cpp
+++ b/clang/test/ExtractAPI/non_type_template.cpp
@@ -310,4 +310,48 @@ NestedTemplateTemplateParamPack<Bar, Bar> var;
// VAR-NEXT: }
// VAR-NEXT: ]
+template <typename T>
+class TypeContainer {
+ public:
+ // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix TYPE
+ typedef Foo<T> Type;
+// TYPE-LABEL: "!testLabel": "c:non_type_template.cpp@ST>1#T@TypeContainer@T@Type",
+// TYPE: "declarationFragments": [
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "keyword",
+// TYPE-NEXT: "spelling": "typedef"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "text",
+// TYPE-NEXT: "spelling": " "
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "typeIdentifier",
+// TYPE-NEXT: "preciseIdentifier": "c:@ST>2#T#NI@Foo",
+// TYPE-NEXT: "spelling": "Foo"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "text",
+// TYPE-NEXT: "spelling": "<"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "typeIdentifier",
+// TYPE-NEXT: "preciseIdentifier": "c:t0.0",
+// TYPE-NEXT: "spelling": "T"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "text",
+// TYPE-NEXT: "spelling": "> "
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "identifier",
+// TYPE-NEXT: "spelling": "Type"
+// TYPE-NEXT: },
+// TYPE-NEXT: {
+// TYPE-NEXT: "kind": "text",
+// TYPE-NEXT: "spelling": ";"
+// TYPE-NEXT: }
+// TYPE-NEXT: ]
+};
+
// expected-no-diagnostics
diff --git a/clang/test/Frontend/optimization-remark-options.c b/clang/test/Frontend/optimization-remark-options.c
index 96e480d140be..357273a65063 100644
--- a/clang/test/Frontend/optimization-remark-options.c
+++ b/clang/test/Frontend/optimization-remark-options.c
@@ -1,7 +1,7 @@
// REQUIRES: x86-registered-target
// RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -mllvm -vectorize-memory-check-threshold=8 -Rpass-analysis=loop-vectorize -emit-llvm -S %s -o - 2>&1 | FileCheck %s
-// CHECK: {{.*}}:10:11: remark: loop not vectorized: cannot prove it is safe to reorder floating-point operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop or by providing the compiler option '-ffast-math'.
+// CHECK: {{.*}}:10:11: remark: loop not vectorized: cannot prove it is safe to reorder floating-point operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop or by providing the compiler option '-ffast-math'
double foo(int N) {
double v = 0.0;
@@ -12,7 +12,7 @@ double foo(int N) {
return v;
}
-// CHECK: {{.*}}:18:3: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop. If the arrays will always be independent specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments. Erroneous results will occur if these options are incorrectly applied!
+// CHECK: {{.*}}:18:3: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop; if the arrays will always be independent, specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments -- erroneous results will occur if these options are incorrectly applied
void foo2(int *dw, int *uw, int *A, int *B, int *C, int *D, int N) {
for (long i = 0; i < N; i++) {
diff --git a/clang/test/Frontend/x86-target-cpu.c b/clang/test/Frontend/x86-target-cpu.c
index 6b99b2c8574a..6c8502ac2c21 100644
--- a/clang/test/Frontend/x86-target-cpu.c
+++ b/clang/test/Frontend/x86-target-cpu.c
@@ -15,14 +15,8 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu cannonlake -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu icelake-client -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu icelake-server -verify %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knl -verify=knl %s
-// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// knl-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knm -verify=knm %s
-// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
-// knm-warning@*:* {{KNL, KNM related Intel Xeon Phi CPU's specific ISA's supports will be removed in LLVM 19.}}
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knl -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu knm -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu bonnell -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu silvermont -verify %s
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu k8 -verify %s
diff --git a/clang/test/InstallAPI/binary-attributes.test b/clang/test/InstallAPI/binary-attributes.test
index b28e99f64454..fd9ff12998a3 100644
--- a/clang/test/InstallAPI/binary-attributes.test
+++ b/clang/test/InstallAPI/binary-attributes.test
@@ -30,13 +30,13 @@
; RUN: -install_name /System/Library/Frameworks/Simple.framework/Versions/A/Simple \
; RUN: -current_version 1.2.3 -compatibility_version 1 -fapplication-extension \
; RUN: -o tmp.tbd --verify-against=%t/Simple 2>&1 | FileCheck -check-prefix=APPEXTSAFE %s
-; APPEXTSAFE: error: ApplicationExtensionSafe flag does not match: 'true' (provided) vs 'false' (found)
+; APPEXTSAFE: error: the ApplicationExtensionSafe flag does not match: 'true' (provided) vs 'false' (found)
; RUN: not clang-installapi -target x86_64-apple-macos10.12 \
; RUN: -install_name /System/Library/Frameworks/Simple.framework/Versions/A/Simple \
; RUN: -current_version 1.2.3 -compatibility_version 1 -not_for_dyld_shared_cache \
; RUN: -o tmp.tbd --verify-against=%t/Simple 2>&1 | FileCheck -check-prefix=SHARED_CACHE %s
-; SHARED_CACHE: error: NotForDyldSharedCache flag does not match: 'true' (provided) vs 'false' (found)
+; SHARED_CACHE: error: the NotForDyldSharedCache flag does not match: 'true' (provided) vs 'false' (found)
; RUN: not clang-installapi -target x86_64-apple-macos10.12 \
; RUN: -install_name /System/Library/Frameworks/Simple.framework/Versions/A/Simple \
diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp
index 41550cf02aa3..4c2aa3ae2c54 100644
--- a/clang/test/Lexer/cxx-features.cpp
+++ b/clang/test/Lexer/cxx-features.cpp
@@ -1,17 +1,17 @@
// RUN: %clang_cc1 -std=c++98 -fcxx-exceptions -verify %s
// RUN: %clang_cc1 -std=c++11 -fcxx-exceptions -verify %s
-// RUN: %clang_cc1 -std=c++14 -fcxx-exceptions -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++20 -fcxx-exceptions -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++23 -fcxx-exceptions -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -fsized-deallocation -verify %s
+// RUN: %clang_cc1 -std=c++14 -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -std=c++20 -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -std=c++23 -fcxx-exceptions -verify %s
+// RUN: %clang_cc1 -std=c++2c -fcxx-exceptions -verify %s
//
-// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -fno-relaxed-template-template-args -DNO_RELAXED_TEMPLATE_TEMPLATE_ARGS=1 -verify %s
-// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fsized-deallocation -DCONCEPTS_TS=1 -verify %s
-// RUN: %clang_cc1 -std=c++14 -fno-rtti -fno-threadsafe-statics -verify %s -DNO_EXCEPTIONS -DNO_RTTI -DNO_THREADSAFE_STATICS -fsized-deallocation
-// RUN: %clang_cc1 -std=c++14 -fchar8_t -DNO_EXCEPTIONS -DCHAR8_T -verify -fsized-deallocation %s
-// RUN: %clang_cc1 -std=c++2a -fno-char8_t -DNO_EXCEPTIONS -DNO_CHAR8_T -verify -fsized-deallocation %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -fno-relaxed-template-template-args -DNO_RELAXED_TEMPLATE_TEMPLATE_ARGS=1 -verify %s
+// RUN: %clang_cc1 -std=c++17 -fcxx-exceptions -DCONCEPTS_TS=1 -verify %s
+// RUN: %clang_cc1 -std=c++14 -fno-rtti -fno-threadsafe-statics -verify %s -DNO_EXCEPTIONS -DNO_RTTI -DNO_THREADSAFE_STATICS
+// RUN: %clang_cc1 -std=c++14 -fchar8_t -DNO_EXCEPTIONS -DCHAR8_T -verify %s
+// RUN: %clang_cc1 -std=c++2a -fno-char8_t -DNO_EXCEPTIONS -DNO_CHAR8_T -verify %s
// expected-no-diagnostics
diff --git a/clang/test/Misc/diag-template-diffing.cpp b/clang/test/Misc/diag-template-diffing-cxx11.cpp
index eefeb0b1117c..eefeb0b1117c 100644
--- a/clang/test/Misc/diag-template-diffing.cpp
+++ b/clang/test/Misc/diag-template-diffing-cxx11.cpp
diff --git a/clang/test/Misc/diag-template-diffing-cxx26.cpp b/clang/test/Misc/diag-template-diffing-cxx26.cpp
new file mode 100644
index 000000000000..2b6dd86a9885
--- /dev/null
+++ b/clang/test/Misc/diag-template-diffing-cxx26.cpp
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -fsyntax-only %s -std=c++26 -verify=expected,notree
+// RUN: %clang_cc1 -fsyntax-only %s -std=c++26 -fno-elide-type -verify=expected,notree
+// RUN: %clang_cc1 -fsyntax-only %s -std=c++26 -fdiagnostics-show-template-tree -verify=expected,tree
+// RUN: %clang_cc1 -fsyntax-only %s -std=c++26 -fno-elide-type -fdiagnostics-show-template-tree -verify=expected,tree
+
+namespace GH93068 {
+ int n[2];
+
+ template <auto> struct A {}; // #A
+
+ namespace t1 {
+ // notree-error@#1 {{no viable conversion from 'A<0>' to 'A<n + 1>'}}
+
+ /* tree-error@#1 {{no viable conversion
+ A<
+ [0 != n + 1]>}}*/
+
+ A<n + 1> v1 = A<0>(); // #1
+ // expected-note@#A {{no known conversion from 'A<0>' to 'const A<&n[1]> &' for 1st argument}}
+ // expected-note@#A {{no known conversion from 'A<0>' to 'A<&n[1]> &&' for 1st argument}}
+
+ // notree-error@#2 {{no viable conversion from 'A<n>' to 'A<n + 1>'}}
+ /* tree-error@#2 {{no viable conversion
+ A<
+ [n != n + 1]>}}*/
+
+ A<n + 1> v2 = A<n>(); // #2
+ // expected-note@#A {{no known conversion from 'A<n>' to 'const A<&n[1]> &' for 1st argument}}
+ // expected-note@#A {{no known conversion from 'A<n>' to 'A<&n[1]> &&' for 1st argument}}
+ } // namespace t1
+
+ namespace t2 {
+ A<n> v1;
+ A<n + 1> v2;
+
+ // notree-note@#A {{no known conversion from 'A<n>' to 'const A<(no argument)>' for 1st argument}}
+ // notree-note@#A {{no known conversion from 'A<n>' to 'A<(no argument)>' for 1st argument}}
+
+ /* tree-note@#A {{no known conversion from argument type to parameter type for 1st argument
+ [(no qualifiers) != const] A<
+ [n != (no argument)]>}}*/
+
+ /* tree-note@#A {{no known conversion from argument type to parameter type for 1st argument
+ A<
+ [n != (no argument)]>}}*/
+
+ void f() { v2 = v1; } // expected-error {{no viable overloaded '='}}
+ } // namespace t2
+} // namespace GH93068
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index fd0e6d71baa8..99732694f72a 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -63,7 +63,6 @@
// CHECK-NEXT: CoroOnlyDestroyWhenComplete (SubjectMatchRule_record)
// CHECK-NEXT: CoroReturnType (SubjectMatchRule_record)
// CHECK-NEXT: CoroWrapper (SubjectMatchRule_function)
-// CHECK-NEXT: CountedBy (SubjectMatchRule_field)
// CHECK-NEXT: DLLExport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
// CHECK-NEXT: DLLImport (SubjectMatchRule_function, SubjectMatchRule_variable, SubjectMatchRule_record, SubjectMatchRule_objc_interface)
// CHECK-NEXT: Destructor (SubjectMatchRule_function)
diff --git a/clang/test/Modules/implicit-module-remap.cpp b/clang/test/Modules/implicit-module-remap.cpp
new file mode 100644
index 000000000000..47927b969401
--- /dev/null
+++ b/clang/test/Modules/implicit-module-remap.cpp
@@ -0,0 +1,21 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t
+//
+// RUN: %clang_cc1 -fmodules -fmodule-map-file=module.modulemap -fmodules-cache-path=%t -remap-file "test.cpp;%t/test.cpp" %t/test.cpp
+
+//--- a.h
+#define FOO
+
+//--- module.modulemap
+module a {
+ header "a.h"
+}
+
+//--- test.cpp
+#include "a.h"
+
+#ifndef FOO
+#error foo
+#endif
+
diff --git a/clang/test/OpenMP/assumes_codegen.cpp b/clang/test/OpenMP/assumes_codegen.cpp
index 4a2518a51ec3..4206e5a9caab 100644
--- a/clang/test/OpenMP/assumes_codegen.cpp
+++ b/clang/test/OpenMP/assumes_codegen.cpp
@@ -67,46 +67,46 @@ int lambda_outer() {
}
#pragma omp end assumes
-// AST: void foo() __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) {
-// AST-NEXT: }
-// AST-NEXT: class BAR {
-// AST-NEXT: public:
-// AST-NEXT: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) BAR() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void bar1() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) static void bar2() {
-// AST-NEXT: }
-// AST-NEXT: };
-// AST-NEXT: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void bar() {
-// AST-NEXT: BAR b;
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void baz();
-// AST-NEXT: template <typename T> class BAZ {
-// AST-NEXT: public:
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) BAZ<T>() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void baz1() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) static void baz2() {
-// AST-NEXT: }
-// AST-NEXT: };
-// AST-NEXT: template<> class BAZ<float> {
-// AST-NEXT: public:
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) BAZ() {
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void baz1();
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) static void baz2();
-// AST-NEXT: };
-// AST-NEXT: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) void baz() {
-// AST-NEXT: BAZ<float> b;
-// AST-NEXT: }
-// AST-NEXT: __attribute__((assume("ompx_lambda_assumption"))) __attribute__((assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses"))) __attribute__((assume("omp_no_openmp"))) int lambda_outer() {
-// AST-NEXT: auto lambda_inner = []() {
-// AST-NEXT: return 42;
-// AST-NEXT: };
-// AST-NEXT: return lambda_inner();
-// AST-NEXT: }
+// AST{LITERAL}: void foo() [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: class BAR {
+// AST-NEXT{LITERAL}: public:
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] BAR() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void bar1() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] static void bar2() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: };
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void bar() {
+// AST-NEXT{LITERAL}: BAR b;
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void baz();
+// AST-NEXT{LITERAL}: template <typename T> class BAZ {
+// AST-NEXT{LITERAL}: public:
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] BAZ<T>() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void baz1() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] static void baz2() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: };
+// AST-NEXT{LITERAL}: template<> class BAZ<float> {
+// AST-NEXT{LITERAL}: public:
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] BAZ() {
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void baz1();
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] static void baz2();
+// AST-NEXT{LITERAL}: };
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] void baz() {
+// AST-NEXT{LITERAL}: BAZ<float> b;
+// AST-NEXT{LITERAL}: }
+// AST-NEXT{LITERAL}: [[omp::assume("ompx_lambda_assumption")]] [[omp::assume("omp_no_openmp_routines,ompx_another_warning,ompx_after_invalid_clauses")]] [[omp::assume("omp_no_openmp")]] int lambda_outer() {
+// AST-NEXT{LITERAL}: auto lambda_inner = []() {
+// AST-NEXT{LITERAL}: return 42;
+// AST-NEXT{LITERAL}: };
+// AST-NEXT{LITERAL}: return lambda_inner();
+// AST-NEXT{LITERAL}: }
#endif
diff --git a/clang/test/OpenMP/assumes_print.cpp b/clang/test/OpenMP/assumes_print.cpp
index d8bdaaaf4518..9254c29ab833 100644
--- a/clang/test/OpenMP/assumes_print.cpp
+++ b/clang/test/OpenMP/assumes_print.cpp
@@ -37,8 +37,8 @@ void baz() {
}
#pragma omp end assumes
-// CHECK: void foo() __attribute__((assume("omp_no_openmp_routines"))) __attribute__((assume("omp_no_openmp")))
-// CHECK: __attribute__((assume("ompx_range_bar_only"))) __attribute__((assume("ompx_range_bar_only_2"))) __attribute__((assume("omp_no_openmp_routines"))) __attribute__((assume("omp_no_openmp"))) void bar()
-// CHECK: __attribute__((assume("ompx_1234"))) __attribute__((assume("omp_no_openmp_routines"))) __attribute__((assume("omp_no_openmp"))) void baz()
+// CHECK{LITERAL}: void foo() [[omp::assume("omp_no_openmp_routines")]] [[omp::assume("omp_no_openmp")]]
+// CHECK{LITERAL}: [[omp::assume("ompx_range_bar_only")]] [[omp::assume("ompx_range_bar_only_2")]] [[omp::assume("omp_no_openmp_routines")]] [[omp::assume("omp_no_openmp")]] void bar()
+// CHECK{LITERAL}: [[omp::assume("ompx_1234")]] [[omp::assume("omp_no_openmp_routines")]] [[omp::assume("omp_no_openmp")]] void baz()
#endif
diff --git a/clang/test/OpenMP/assumes_template_print.cpp b/clang/test/OpenMP/assumes_template_print.cpp
index 614138b2ee0b..f8857ffadf78 100644
--- a/clang/test/OpenMP/assumes_template_print.cpp
+++ b/clang/test/OpenMP/assumes_template_print.cpp
@@ -17,7 +17,7 @@ template <typename T>
struct S {
int a;
// CHECK: template <typename T> struct S {
-// CHECK: void foo() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: void foo() [[omp::assume("ompx_global_assumption")]] {
void foo() {
#pragma omp parallel
{}
@@ -25,15 +25,15 @@ struct S {
};
// CHECK: template<> struct S<int> {
-// CHECK: void foo() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: void foo() [[omp::assume("ompx_global_assumption")]] {
#pragma omp begin assumes no_openmp
-// CHECK: __attribute__((assume("omp_no_openmp"))) void S_with_assumes_no_call() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: [[omp::assume("omp_no_openmp")]] void S_with_assumes_no_call() [[omp::assume("ompx_global_assumption")]] {
void S_with_assumes_no_call() {
S<int> s;
s.a = 0;
}
-// CHECK: __attribute__((assume("omp_no_openmp"))) void S_with_assumes_call() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: [[omp::assume("omp_no_openmp")]] void S_with_assumes_call() [[omp::assume("ompx_global_assumption")]] {
void S_with_assumes_call() {
S<int> s;
s.a = 0;
@@ -42,7 +42,7 @@ void S_with_assumes_call() {
}
#pragma omp end assumes
-// CHECK: void S_without_assumes() __attribute__((assume("ompx_global_assumption"))) {
+// CHECK{LITERAL}: void S_without_assumes() [[omp::assume("ompx_global_assumption")]] {
void S_without_assumes() {
S<int> s;
s.foo();
@@ -54,7 +54,7 @@ void S_without_assumes() {
template <typename T>
struct P {
// CHECK: template <typename T> struct P {
-// CHECK: __attribute__((assume("ompx_global_assumption"))) void foo() {
+// CHECK{LITERAL}: [[omp::assume("ompx_global_assumption")]] void foo() {
int a;
void foo() {
#pragma omp parallel
@@ -65,21 +65,21 @@ struct P {
// TODO: Avoid the duplication here:
// CHECK: template<> struct P<int> {
-// CHECK: __attribute__((assume("ompx_global_assumption"))) __attribute__((assume("ompx_global_assumption"))) void foo() {
+// CHECK{LITERAL}: [[omp::assume("ompx_global_assumption")]] [[omp::assume("ompx_global_assumption")]] void foo() {
-// CHECK: __attribute__((assume("ompx_global_assumption"))) void P_without_assumes() {
+// CHECK{LITERAL}: [[omp::assume("ompx_global_assumption")]] void P_without_assumes() {
void P_without_assumes() {
P<int> p;
p.foo();
}
#pragma omp begin assumes no_openmp
-// CHECK: __attribute__((assume("omp_no_openmp"))) __attribute__((assume("ompx_global_assumption"))) void P_with_assumes_no_call() {
+// CHECK{LITERAL}: [[omp::assume("omp_no_openmp")]] [[omp::assume("ompx_global_assumption")]] void P_with_assumes_no_call() {
void P_with_assumes_no_call() {
P<int> p;
p.a = 0;
}
-// CHECK: __attribute__((assume("omp_no_openmp"))) __attribute__((assume("ompx_global_assumption"))) void P_with_assumes_call() {
+// CHECK{LITERAL}: [[omp::assume("omp_no_openmp")]] [[omp::assume("ompx_global_assumption")]] void P_with_assumes_call() {
void P_with_assumes_call() {
P<int> p;
p.a = 0;
diff --git a/clang/test/OpenMP/atomic_messages.c b/clang/test/OpenMP/atomic_messages.c
index 9f6662a9e136..f4e7db52494a 100644
--- a/clang/test/OpenMP/atomic_messages.c
+++ b/clang/test/OpenMP/atomic_messages.c
@@ -405,67 +405,67 @@ void compare(void) {
int x = 0;
int d = 0;
int e = 0;
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare
{}
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected exactly one expression statement}}
#pragma omp atomic compare
{
x = d;
x = e;
}
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare
{ x += d; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare
{ bbar(); }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected conditional operator}}
#pragma omp atomic compare
{ x = d; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect binary operator in conditional expression}}
#pragma omp atomic compare
{ x = ffoo() ? e : x; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect '<', '>' or '==' as order operator}}
#pragma omp atomic compare
{ x = x >= e ? e : x; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}}
#pragma omp atomic compare
{ x = d > e ? e : x; }
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect result value to be at false expression}}
#pragma omp atomic compare
{ x = d > x ? e : d; }
-// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expect binary operator in conditional expression}}
#pragma omp atomic compare
{
if (foo())
x = d;
}
-// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expect '<', '>' or '==' as order operator}}
#pragma omp atomic compare
{
if (x >= d)
x = d;
}
-// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}}
#pragma omp atomic compare
{
if (e > d)
x = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected exactly one expression statement}}
#pragma omp atomic compare
{
@@ -473,7 +473,7 @@ void compare(void) {
x = e;
d = e;
}
-// omp51-error@+7 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+7 {{the statement for 'atomic compare' must be a compound statement of form '{x = expr ordop x ? expr : x;}', '{x = x ordop expr? expr : x;}', '{x = x == e ? d : x;}', '{x = e == x ? d : x;}', or 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+6 {{unexpected 'else' statement}}
#pragma omp atomic compare
{
@@ -491,61 +491,61 @@ void compare_capture(void) {
int v = 0;
int r = 0;
float dr = 0.0;
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare capture
if (x == e) {}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected exactly one expression statement}}
#pragma omp atomic compare capture
if (x == e) {
x = d;
v = x;
}
-// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expected assignment statement}}
#pragma omp atomic compare capture
if (x == e) {
bbar();
}
-// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+4 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+3 {{expected assignment statement}}
#pragma omp atomic compare capture
if (x == e) {
x += d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect binary operator in conditional expression}}
#pragma omp atomic compare capture
if (ffoo()) {
x = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect '==' operator}}
#pragma omp atomic compare capture
if (x > e) {
x = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}}
#pragma omp atomic compare capture
if (d == e) {
x = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect 'else' statement}}
#pragma omp atomic compare capture
if (x == e) {
x = d;
}
-// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+4 {{expected compound statement}}
#pragma omp atomic compare capture
if (x == e) {
x = d;
} else {
}
-// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+4 {{expected exactly one expression statement}}
#pragma omp atomic compare capture
if (x == e) {
@@ -554,7 +554,7 @@ void compare_capture(void) {
v = x;
d = e;
}
-// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+5 {{expected assignment statement}}
#pragma omp atomic compare capture
if (x == e) {
@@ -562,7 +562,7 @@ void compare_capture(void) {
} else {
bbar();
}
-// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+5 {{expected assignment statement}}
#pragma omp atomic compare capture
if (x == e) {
@@ -570,7 +570,7 @@ void compare_capture(void) {
} else {
v += x;
}
-// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+6 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+5 {{expect an assignment statement 'v = x'}}
#pragma omp atomic compare capture
if (x == e) {
@@ -578,35 +578,35 @@ void compare_capture(void) {
} else {
v = d;
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare capture
{}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect a compound statement}}
#pragma omp atomic compare capture
x = x > e ? e : x;
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect a 'if' statement}}
#pragma omp atomic compare capture
{ x = x > e ? e : x; }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect a form 'r = x == e; if (r) ...'}}
#pragma omp atomic compare capture
{ r = x == e; if (x == d) { x = e; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { bbar(); } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x += d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) {} }
-// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+5 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+4 {{expected exactly one expression statement}}
#pragma omp atomic compare capture
{
@@ -616,19 +616,19 @@ void compare_capture(void) {
v = x;
}
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect '==' operator}}
#pragma omp atomic compare capture
{ r = x > e; if (r) { x = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect comparison in a form of 'x == e', 'e == x', 'x ordop expr', or 'expr ordop x'}}
#pragma omp atomic compare capture
{ r = d == e; if (r) { x = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected compound statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x = d; } else {} }
-// omp51-error@+7 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+7 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+6 {{expected exactly one expression statement}}
#pragma omp atomic compare capture
{
@@ -640,40 +640,40 @@ void compare_capture(void) {
d = e;
}
}
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x = d; } else { bbar(); } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x = d; } else { v += x; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect an assignment statement 'v = x'}}
#pragma omp atomic compare capture
{ r = x == e; if (r) { x = d; } else { v = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ v += x; if (x == e) { x = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ if (x == e) { x = d; } v += x; }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect an assignment statement 'v = x'}}
#pragma omp atomic compare capture
{ v = d; if (x == e) { x = d; } }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect an assignment statement 'v = x'}}
#pragma omp atomic compare capture
{ if (x == e) { x = d; } v = d; }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expected assignment statement}}
#pragma omp atomic compare capture
{ v = x; bbar(); }
-// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'.}}
+// omp51-error@+3 {{the statement for 'atomic compare capture' must be a compound statement of form '{v = x; cond-up-stmt}', ''{cond-up-stmt v = x;}', '{if(x == e) {x = d;} else {v = x;}}', '{r = x == e; if(r) {x = d;}}', or '{r = x == e; if(r) {x = d;} else {v = x;}}', where 'cond-update-stmt' can have one of the following forms: 'if(expr ordop x) {x = expr;}', 'if(x ordop expr) {x = expr;}', 'if(x == e) {x = d;}', or 'if(e == x) {x = d;}' where 'x' is an lvalue expression with scalar type, 'expr', 'e', and 'd' are expressions with scalar type, and 'ordop' is one of '<' or '>'}}
// omp51-note@+2 {{expect integer value}}
#pragma omp atomic compare capture
{ dr = x == e; if (dr) { x = d; } }
diff --git a/clang/test/OpenMP/distribute_firstprivate_messages.cpp b/clang/test/OpenMP/distribute_firstprivate_messages.cpp
index 30fa8be519ef..f507c86b601f 100644
--- a/clang/test/OpenMP/distribute_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_firstprivate_messages.cpp
@@ -95,7 +95,7 @@ int main(int argc, char **argv) {
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
- #pragma omp distribute firstprivate (a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-error {{no matching constructor for initialization of 'S3'}}
+ #pragma omp distribute firstprivate (a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-error {{no matching constructor for initialization of 'S3'}}
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
@@ -103,11 +103,11 @@ int main(int argc, char **argv) {
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
- #pragma omp distribute firstprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+ #pragma omp distribute firstprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
- #pragma omp distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+ #pragma omp distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i) foo();
#pragma omp target
#pragma omp teams
diff --git a/clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
index 84d6337be34b..4bed1fe2c3a3 100644
--- a/clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
@@ -119,7 +119,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -129,7 +129,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -241,7 +241,7 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -256,12 +256,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(ca) // expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(ca) // expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -292,12 +292,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(m) // expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(m) // expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -329,13 +329,13 @@ int main(int argc, char **argv) {
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(g) firstprivate(g) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(g) firstprivate(g) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{Type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp parallel
diff --git a/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
index f403922e14e8..0a0962ef57c1 100644
--- a/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
@@ -119,7 +119,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -129,7 +129,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}} expected-warning 2 {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}} expected-warning 2 {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -228,7 +228,7 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 1 {{const-qualified variable without mutable fields cannot be lastprivate}} expected-error 2 {{const-qualified variable cannot be lastprivate}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 1 {{const-qualified variable without mutable fields cannot be lastprivate}} expected-error 2 {{const-qualified variable cannot be lastprivate}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -243,12 +243,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(ca) // expected-error {{const-qualified variable without mutable fields cannot be lastprivate}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(ca) // expected-error {{const-qualified variable without mutable fields cannot be lastprivate}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -279,12 +279,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -325,13 +325,13 @@ int main(int argc, char **argv) {
// expected-error@+3 {{firstprivate variable cannot be lastprivate}} expected-note@+3 {{defined as firstprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for firstprivate(m) lastprivate(m) // expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for firstprivate(m) lastprivate(m) // expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{Type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
static int si;
diff --git a/clang/test/OpenMP/distribute_parallel_for_private_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_private_messages.cpp
index d25598e46f81..2e0e75096a26 100644
--- a/clang/test/OpenMP/distribute_parallel_for_private_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_private_messages.cpp
@@ -50,7 +50,7 @@ public:
#pragma omp target
#pragma omp teams
#pragma omp distribute parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
index 6b3d9da9a3a6..864fb597214b 100644
--- a/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
@@ -187,7 +187,7 @@ T tmain(T argc) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}} expected-warning 2 {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning 2 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}} expected-warning 2 {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning 2 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -232,7 +232,7 @@ T tmain(T argc) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}} expected-warning 2 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}} expected-warning 2 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -371,12 +371,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -386,12 +386,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : ba) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : ba) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(* : ca) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(* : ca) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -416,12 +416,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -437,12 +437,12 @@ int main(int argc, char **argv) {
#pragma omp parallel private(k)
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
index 43bc6ad8e637..0cb8c01625db 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
@@ -50,7 +50,7 @@ public:
#pragma omp target
#pragma omp teams
#pragma omp distribute parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp b/clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
index 7c83e4c674c6..6dc6e777fb33 100644
--- a/clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
@@ -117,7 +117,7 @@ T tmain(T argc, S **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared (a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared (a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -131,14 +131,14 @@ T tmain(T argc, S **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(ca) // expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(ca) // expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -152,7 +152,7 @@ T tmain(T argc, S **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(e, g) // expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(e, g) // expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -291,7 +291,7 @@ int main(int argc, char **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared (a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared (a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -305,14 +305,14 @@ int main(int argc, char **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(ca) // expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(ca) // expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
@@ -326,7 +326,7 @@ int main(int argc, char **argv) {
#pragma omp target
#pragma omp teams
-#pragma omp distribute parallel for simd shared(e, g) // expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute parallel for simd shared(e, g) // expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for(int k = 0 ; k < n ; k++) {
acc++;
}
diff --git a/clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp b/clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp
index 43057fe5bacc..bc1dfcfe7ab4 100644
--- a/clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_firstprivate_messages.cpp
@@ -111,7 +111,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(z, a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(z, a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -121,7 +121,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -233,7 +233,7 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -248,12 +248,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(ca) // expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(ca) // expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -284,12 +284,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(m) // expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(m) // expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -321,13 +321,13 @@ int main(int argc, char **argv) {
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(g) firstprivate(g) //expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(g) firstprivate(g) //expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{Type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp parallel
diff --git a/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp b/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp
index 7658288242ab..379f57547498 100644
--- a/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_lastprivate_messages.cpp
@@ -120,7 +120,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -130,7 +130,7 @@ int foomain(int argc, char **argv) {
++k;
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}} expected-warning 2 {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}} expected-warning 2 {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int k = 0; k < argc; ++k)
++k;
#pragma omp target
@@ -229,7 +229,7 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 1 {{const-qualified variable without mutable fields cannot be lastprivate}} expected-error 2 {{const-qualified variable cannot be lastprivate}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 1 {{const-qualified variable without mutable fields cannot be lastprivate}} expected-error 2 {{const-qualified variable cannot be lastprivate}} expected-error {{incomplete type 'S1' where a complete type is required}} expected-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -244,12 +244,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(ba) // expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(ba) // expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(ca) // expected-error {{const-qualified variable without mutable fields cannot be lastprivate}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(ca) // expected-error {{const-qualified variable without mutable fields cannot be lastprivate}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -280,12 +280,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
#pragma omp target
@@ -326,13 +326,13 @@ int main(int argc, char **argv) {
// expected-error@+3 {{firstprivate variable cannot be lastprivate}} expected-note@+3 {{defined as firstprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd firstprivate(m) lastprivate(m) // expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd firstprivate(m) lastprivate(m) // expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
// expected-error@+3 {{lastprivate variable cannot be firstprivate}} expected-note@+3 {{defined as lastprivate}}
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{Type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd lastprivate(n) firstprivate(n) // expected-error {{calling a private constructor of class 'S6'}} expected-warning {{type 'S6' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i)
foo();
static int si;
diff --git a/clang/test/OpenMP/distribute_simd_loop_messages.cpp b/clang/test/OpenMP/distribute_simd_loop_messages.cpp
index 5a55f9569b8d..e56c7dfbddab 100644
--- a/clang/test/OpenMP/distribute_simd_loop_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_loop_messages.cpp
@@ -14,7 +14,7 @@ public:
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
@@ -490,7 +490,7 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
@@ -501,41 +501,41 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (GoodIter I(1,2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1,2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (begin = GoodIter(1,2); begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(1,2); begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams
@@ -546,7 +546,7 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (begin = end; begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams
@@ -576,7 +576,7 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams
@@ -600,7 +600,7 @@ int test_with_random_access_iterator() {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (Iter0 I = begin0; I < end0; ++I) // expected-warning 2 {{Type 'Iter0' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (Iter0 I = begin0; I < end0; ++I) // expected-warning 2 {{type 'Iter0' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
@@ -608,7 +608,7 @@ int test_with_random_access_iterator() {
// Initializer is constructor without params.
// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
#pragma omp distribute simd
- for (Iter0 I; I < end0; ++I) // expected-warning {{Type 'Iter0' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (Iter0 I; I < end0; ++I) // expected-warning {{type 'Iter0' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
Iter1 begin1, end1;
@@ -654,7 +654,7 @@ template <typename IT, int ST> class TC {
// expected-note@+3 {{loop step is expected to be positive due to this condition}}
// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
#pragma omp distribute simd
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp distribute simd
@@ -697,7 +697,7 @@ template <typename IT, int ST=0> int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams
#pragma omp distribute simd
- for (IT I = begin; I < end; I+=TC<int,ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I+=TC<int,ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
diff --git a/clang/test/OpenMP/distribute_simd_private_messages.cpp b/clang/test/OpenMP/distribute_simd_private_messages.cpp
index 261a46ac6099..8be71938e0fa 100644
--- a/clang/test/OpenMP/distribute_simd_private_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_private_messages.cpp
@@ -50,7 +50,7 @@ public:
#pragma omp target
#pragma omp teams
#pragma omp distribute simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/distribute_simd_reduction_messages.cpp b/clang/test/OpenMP/distribute_simd_reduction_messages.cpp
index d27360ac9b2c..03b6ee5f4a25 100644
--- a/clang/test/OpenMP/distribute_simd_reduction_messages.cpp
+++ b/clang/test/OpenMP/distribute_simd_reduction_messages.cpp
@@ -187,7 +187,7 @@ T tmain(T argc) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}} expected-warning 2 {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning 2 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified variable cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}} expected-warning 2 {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning 2 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -232,7 +232,7 @@ T tmain(T argc) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}} expected-warning 2 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}} expected-warning 2 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -376,12 +376,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}} expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}}
+#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}} expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}}
+#pragma omp distribute simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified variable cannot be reduction}} expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}} expected-error {{incomplete type 'S1' where a complete type is required}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -391,12 +391,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : ba) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{Type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(+ : ba) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{type 'const S2[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(* : ca) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(* : ca) // expected-error {{const-qualified variable cannot be reduction}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -421,12 +421,12 @@ int main(int argc, char **argv) {
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}} expected-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}}
+#pragma omp distribute simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}} expected-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
#pragma omp teams
-#pragma omp distribute simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}} expected-warning {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp distribute simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}} expected-warning {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; ++i)
foo();
#pragma omp target
@@ -440,7 +440,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < 10; ++i)
foo();
#if __cplusplus < 201103L // < C++11
-// expected-warning@+5 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+// expected-warning@+5 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
#endif
#pragma omp parallel private(k)
#pragma omp target
@@ -449,7 +449,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < 10; ++i)
foo();
#if __cplusplus < 201103L // < C++11
-// expected-warning@+4 {{Type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+// expected-warning@+4 {{type 'S3' is not trivially copyable and not guaranteed to be mapped correctly}}
#endif
#pragma omp target
#pragma omp teams
diff --git a/clang/test/OpenMP/reduction_implicit_map.cpp b/clang/test/OpenMP/reduction_implicit_map.cpp
index 0f67cdc56ddc..765e90bcba85 100644
--- a/clang/test/OpenMP/reduction_implicit_map.cpp
+++ b/clang/test/OpenMP/reduction_implicit_map.cpp
@@ -47,7 +47,7 @@ int bar() {
S2 o[5];
//warnig "copyable and not guaranteed to be mapped correctly" and
//implicit map generated.
-#pragma omp target parallel reduction(+:o[0]) //expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target parallel reduction(+:o[0]) //expected-warning {{type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 10; i++);
double b[10][10][10];
//no error no implicit map generated, the map for b is generated but not
diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
index 2f829d2ad094..1afedc6683f8 100644
--- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -4,7 +4,7 @@
// host-no-diagnostics
-void baz(void) __attribute__((assume("omp_no_openmp")));
+[[omp::assume("omp_no_openmp")]] void baz(void);
void bar1(void) {
#pragma omp parallel // #0
@@ -24,7 +24,7 @@ void foo1(void) {
// all-remark@#2 {{Rewriting generic-mode kernel with a customized state machine. [OMP131]}}
{
- baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override. [OMP121]}}
+ baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override. [OMP121]}}
#pragma omp parallel // #3
{
}
@@ -39,7 +39,7 @@ void foo2(void) {
#pragma omp target teams // #5
// all-remark@#5 {{Rewriting generic-mode kernel with a customized state machine. [OMP131]}}
{
- baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override. [OMP121]}}
+ baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override. [OMP121]}}
#pragma omp parallel // #6
{
}
@@ -57,7 +57,7 @@ void foo3(void) {
#pragma omp target teams // #8
// all-remark@#8 {{Rewriting generic-mode kernel with a customized state machine. [OMP131]}}
{
- baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override. [OMP121]}}
+ baz(); // all-remark {{Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override. [OMP121]}}
#pragma omp parallel // #9
{
}
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
index c48a4b966077..5ce8f1fa4046 100644
--- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -3,7 +3,7 @@
// host-no-diagnostics
-void baz(void) __attribute__((assume("omp_no_openmp")));
+[[omp::assume("omp_no_openmp")]] void baz(void);
void bar(void) {
#pragma omp parallel // #1 \
@@ -16,7 +16,7 @@ void foo(void) {
#pragma omp target teams // #2
// expected-remark@#2 {{Rewriting generic-mode kernel with a customized state machine. [OMP131]}}
{
- baz(); // expected-remark {{Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override. [OMP121]}}
+ baz(); // expected-remark {{Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override. [OMP121]}}
#pragma omp parallel
{
}
diff --git a/clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp b/clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp
index 19f6ede043d8..5160fbbfb4a7 100644
--- a/clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp
+++ b/clang/test/OpenMP/requires_default_atomic_mem_order_messages.cpp
@@ -7,6 +7,6 @@ void foo2() {
}
#pragma omp requires atomic_default_mem_order(seq_cst) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-note 2 {{atomic_default_mem_order clause previously used here}}
-#pragma omp requires atomic_default_mem_order(acq_rel) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
-#pragma omp requires atomic_default_mem_order(relaxed) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(acq_rel) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(relaxed) // expected-error {{'atomic' region encountered before requires directive with 'atomic_default_mem_order' clause}} expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#pragma omp requires atomic_default_mem_order(release) // expected-error {{expected 'seq_cst', 'acq_rel' or 'relaxed' in OpenMP clause 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
diff --git a/clang/test/OpenMP/requires_messages.cpp b/clang/test/OpenMP/requires_messages.cpp
index 10d311631b10..dbb2b317067b 100644
--- a/clang/test/OpenMP/requires_messages.cpp
+++ b/clang/test/OpenMP/requires_messages.cpp
@@ -6,39 +6,39 @@ int a;
#pragma omp requires unified_shared_memory // rev-note {{unified_shared_memory clause previously used here}} expected-note{{unified_shared_memory clause previously used here}}
-#pragma omp requires unified_shared_memory, unified_shared_memory // expected-error {{Only one unified_shared_memory clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'unified_shared_memory' clause}}
+#pragma omp requires unified_shared_memory, unified_shared_memory // expected-error {{only one unified_shared_memory clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'unified_shared_memory' clause}}
-#pragma omp requires unified_address // expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires unified_address // expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}}
-#pragma omp requires unified_address, unified_address // expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'unified_address' clause}}
+#pragma omp requires unified_address, unified_address // expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'unified_address' clause}}
#ifdef OMP99
#pragma omp requires reverse_offload // rev-note {{reverse_offload clause previously used here}} rev-note {{reverse_offload clause previously used here}}
-#pragma omp requires reverse_offload, reverse_offload // rev-error {{Only one reverse_offload clause can appear on a requires directive in a single translation unit}} rev-error {{directive '#pragma omp requires' cannot contain more than one 'reverse_offload' clause}}
+#pragma omp requires reverse_offload, reverse_offload // rev-error {{only one reverse_offload clause can appear on a requires directive in a single translation unit}} rev-error {{directive '#pragma omp requires' cannot contain more than one 'reverse_offload' clause}}
#endif
#pragma omp requires dynamic_allocators // rev-note {{dynamic_allocators clause previously used here}} expected-note {{dynamic_allocators clause previously used here}}
-#pragma omp requires dynamic_allocators, dynamic_allocators // expected-error {{Only one dynamic_allocators clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'dynamic_allocators' clause}}
+#pragma omp requires dynamic_allocators, dynamic_allocators // expected-error {{only one dynamic_allocators clause can appear on a requires directive in a single translation unit}} expected-error {{directive '#pragma omp requires' cannot contain more than one 'dynamic_allocators' clause}}
#pragma omp requires atomic_default_mem_order(seq_cst) // rev-note {{atomic_default_mem_order clause previously used here}} expected-note {{atomic_default_mem_order clause previously used here}} expected-note {{atomic_default_mem_order clause previously used here}} expected-note {{atomic_default_mem_order clause previously used here}} expected-note {{atomic_default_mem_order clause previously used here}}
-#pragma omp requires atomic_default_mem_order(acq_rel) // expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(acq_rel) // expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
-#pragma omp requires atomic_default_mem_order(relaxed) // expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(relaxed) // expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#pragma omp requires atomic_default_mem_order // expected-error {{expected '(' after 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
#pragma omp requires atomic_default_mem_order( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'seq_cst', 'acq_rel' or 'relaxed' in OpenMP clause 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
-#pragma omp requires atomic_default_mem_order(seq_cst // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(seq_cst // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#pragma omp requires atomic_default_mem_order(invalid_modifier) // expected-error {{expected 'seq_cst', 'acq_rel' or 'relaxed' in OpenMP clause 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
#pragma omp requires atomic_default_mem_order(shared) // expected-error {{expected 'seq_cst', 'acq_rel' or 'relaxed' in OpenMP clause 'atomic_default_mem_order'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
-#pragma omp requires atomic_default_mem_order(acq_rel), atomic_default_mem_order(relaxed) // expected-error {{directive '#pragma omp requires' cannot contain more than one 'atomic_default_mem_order' claus}} expected-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires atomic_default_mem_order(acq_rel), atomic_default_mem_order(relaxed) // expected-error {{directive '#pragma omp requires' cannot contain more than one 'atomic_default_mem_order' claus}} expected-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#pragma omp requires // expected-error {{expected at least one clause on '#pragma omp requires' directive}}
@@ -46,18 +46,18 @@ int a;
#pragma omp requires nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp requires'}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
-#pragma omp requires unified_address, invalid_clause // expected-warning {{extra tokens at the end of '#pragma omp requires' are ignored}} expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires unified_address, invalid_clause // expected-warning {{extra tokens at the end of '#pragma omp requires' are ignored}} expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}}
#pragma omp requires invalid_clause unified_address // expected-warning {{extra tokens at the end of '#pragma omp requires' are ignored}} expected-error {{expected at least one clause on '#pragma omp requires' directive}}
#ifdef OMP99
-#pragma omp requires unified_shared_memory, unified_address, reverse_offload, dynamic_allocators, atomic_default_mem_order(seq_cst) // rev-error {{Only one unified_shared_memory clause can appear on a requires directive in a single translation unit}} rev-error{{Only one unified_address clause can appear on a requires directive in a single translation unit}} rev-error{{Only one reverse_offload clause can appear on a requires directive in a single translation unit}} rev-error{{Only one dynamic_allocators clause can appear on a requires directive in a single translation unit}} rev-error {{Only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
+#pragma omp requires unified_shared_memory, unified_address, reverse_offload, dynamic_allocators, atomic_default_mem_order(seq_cst) // rev-error {{only one unified_shared_memory clause can appear on a requires directive in a single translation unit}} rev-error{{only one unified_address clause can appear on a requires directive in a single translation unit}} rev-error{{only one reverse_offload clause can appear on a requires directive in a single translation unit}} rev-error{{only one dynamic_allocators clause can appear on a requires directive in a single translation unit}} rev-error {{only one atomic_default_mem_order clause can appear on a requires directive in a single translation unit}}
#endif
namespace A {
- #pragma omp requires unified_address // expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}}
+ #pragma omp requires unified_address // expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}}
namespace B {
- #pragma omp requires unified_address // expected-error {{Only one unified_address clause can appear on a requires directive in a single translation unit}}
+ #pragma omp requires unified_address // expected-error {{only one unified_address clause can appear on a requires directive in a single translation unit}}
}
}
diff --git a/clang/test/OpenMP/target_device_ancestor_messages.cpp b/clang/test/OpenMP/target_device_ancestor_messages.cpp
index bc1d668d1914..e6705b369c70 100644
--- a/clang/test/OpenMP/target_device_ancestor_messages.cpp
+++ b/clang/test/OpenMP/target_device_ancestor_messages.cpp
@@ -2,6 +2,6 @@
// RUN: %clang_cc1 -triple=x86_64 -verify -fopenmp-simd -fopenmp-targets=x86_64 -x c++ -fexceptions -fcxx-exceptions %s
void bar() {
-#pragma omp target device(ancestor : 1) // expected-error {{Device clause with ancestor device-modifier used without specifying 'requires reverse_offload'}}
+#pragma omp target device(ancestor : 1) // expected-error {{device clause with ancestor device-modifier used without specifying 'requires reverse_offload'}}
;
}
diff --git a/clang/test/OpenMP/target_firstprivate_messages.cpp b/clang/test/OpenMP/target_firstprivate_messages.cpp
index 9b211297f531..2eafb367c0c4 100644
--- a/clang/test/OpenMP/target_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/target_firstprivate_messages.cpp
@@ -56,7 +56,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_map_messages.cpp b/clang/test/OpenMP/target_map_messages.cpp
index 3bd432b47e63..10f46687d637 100644
--- a/clang/test/OpenMP/target_map_messages.cpp
+++ b/clang/test/OpenMP/target_map_messages.cpp
@@ -681,13 +681,13 @@ T tmain(T argc) {
#pragma omp target data map(tofrom: argc > 0 ? x : y) // lt50-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error 2 {{expected addressable lvalue in 'map' clause}}
#pragma omp target data map(argc)
#pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}}
-#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning 2 {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
-#pragma omp target data map(ba) // warn-warning 2 {{Type 'const S2 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
-#pragma omp target data map(ca) // warn-warning 2 {{Type 'const S3 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning 2 {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(ba) // warn-warning 2 {{type 'const S2 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(ca) // warn-warning 2 {{type 'const S3 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(da)
#pragma omp target data map(S2::S2s)
#pragma omp target data map(S2::S2sc)
-#pragma omp target data map(e, g) // warn-warning 2 {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(e, g) // warn-warning 2 {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning 2 {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
#pragma omp target data map(k) map(k) // lt50-error 2 {{variable already marked as mapped in current construct}} lt50-note 2 {{used here}}
#pragma omp target map(k), map(k[:5]) // lt50-error 2 {{pointer cannot be mapped along with a section derived from itself}} lt50-note 2 {{used here}}
@@ -815,14 +815,14 @@ int main(int argc, char **argv) {
#pragma omp target data map(tofrom: argc > 0 ? argv[1] : argv[2]) // lt50-error {{expected expression containing only member accesses and/or array sections based on named variables}} ge50-error {{expected addressable lvalue in 'map' clause}}
#pragma omp target data map(argc)
#pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}}
-#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning {{Type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{Type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} warn-warning {{type 'const S2' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{type 'const S3' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(argv[1])
-#pragma omp target data map(ba) // warn-warning {{Type 'const S2 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
-#pragma omp target data map(ca) // warn-warning {{Type 'const S3 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(ba) // warn-warning {{type 'const S2 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(ca) // warn-warning {{type 'const S3 [5]' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(da)
#pragma omp target data map(S2::S2s)
#pragma omp target data map(S2::S2sc)
-#pragma omp target data map(e, g) // warn-warning {{Type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target data map(e, g) // warn-warning {{type 'S4' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
#pragma omp target data map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
#pragma omp target data map(k), map(k) // lt50-error {{variable already marked as mapped in current construct}} lt50-note {{used here}}
#pragma omp target map(k), map(k[:5]) // lt50-error {{pointer cannot be mapped along with a section derived from itself}} lt50-note {{used here}}
@@ -872,7 +872,7 @@ int main(int argc, char **argv) {
{}
#pragma omp target firstprivate(j) map(j) // expected-error {{firstprivate variable cannot be in a map clause in '#pragma omp target' directive}} expected-note {{defined as firstprivate}}
{}
-#pragma omp target map(m) // warn-warning {{Type 'S6<int>' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target map(m) // warn-warning {{type 'S6<int>' is not trivially copyable and not guaranteed to be mapped correctly}}
{}
#pragma omp target
{ s.a++; }
@@ -920,7 +920,7 @@ int main(int argc, char **argv) {
{ s.a++; }
#pragma omp target map(s.s.s.b[:2])
{ s.s.s.b[0]++; }
-#pragma omp target map(s8[0:1], s9) // warn-warning {{Type 'class S8' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{Type 'class S9' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp target map(s8[0:1], s9) // warn-warning {{type 'class S8' is not trivially copyable and not guaranteed to be mapped correctly}} warn-warning {{type 'class S9' is not trivially copyable and not guaranteed to be mapped correctly}}
{}
int **BB, *offset, *a;
diff --git a/clang/test/OpenMP/target_parallel_for_private_messages.cpp b/clang/test/OpenMP/target_parallel_for_private_messages.cpp
index 1c31badf51cd..81b4be4923d7 100644
--- a/clang/test/OpenMP/target_parallel_for_private_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_private_messages.cpp
@@ -56,7 +56,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp b/clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp
index db9d495698b0..c9b5bac0e693 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_private_messages.cpp
@@ -56,7 +56,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_private_messages.cpp b/clang/test/OpenMP/target_private_messages.cpp
index 7ee0c8cffb9c..8cdd3a11e87a 100644
--- a/clang/test/OpenMP/target_private_messages.cpp
+++ b/clang/test/OpenMP/target_private_messages.cpp
@@ -50,7 +50,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_simd_private_messages.cpp b/clang/test/OpenMP/target_simd_private_messages.cpp
index 4a55a506d4ab..f6e4e714f8ff 100644
--- a/clang/test/OpenMP/target_simd_private_messages.cpp
+++ b/clang/test/OpenMP/target_simd_private_messages.cpp
@@ -56,7 +56,7 @@ public:
S5(int v) : a(v) {}
S5 &operator=(S5 &s) {
#pragma omp target simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
- for (int k = 0; k < s.a; ++k) // expected-warning {{Type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (int k = 0; k < s.a; ++k) // expected-warning {{type 'S5' is not trivially copyable and not guaranteed to be mapped correctly}}
++s.a;
return *this;
}
diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp b/clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp
index fccf5515998d..195af52b7892 100644
--- a/clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_messages.cpp
@@ -119,7 +119,7 @@ int main(int argc, char **argv) {
for (i = 0; i < argc; ++i) foo();
#pragma omp target
-#pragma omp teams distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}} expected-warning {{Type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute firstprivate(ca) // expected-error {{no matching constructor for initialization of 'S3'}} expected-warning {{type 'const S3[5]' is not trivially copyable and not guaranteed to be mapped correctly}}
for (i = 0; i < argc; ++i) foo();
#pragma omp target teams distribute firstprivate(da, z)
diff --git a/clang/test/OpenMP/target_update_messages.cpp b/clang/test/OpenMP/target_update_messages.cpp
index 2bf0ade9fe91..83191059202c 100644
--- a/clang/test/OpenMP/target_update_messages.cpp
+++ b/clang/test/OpenMP/target_update_messages.cpp
@@ -18,14 +18,14 @@ static int y;
#pragma omp declare target(y)
void yyy() {
-#pragma omp target update to(y) // expected-error {{the host cannot update a declare target variable that is not externally visible.}}
+#pragma omp target update to(y) // expected-error {{the host cannot update a declare target variable that is not externally visible}}
}
int __attribute__((visibility("hidden"))) z;
#pragma omp declare target(z)
void zzz() {
-#pragma omp target update from(z) // expected-error {{the host cannot update a declare target variable that is not externally visible.}}
+#pragma omp target update from(z) // expected-error {{the host cannot update a declare target variable that is not externally visible}}
}
void foo() {
diff --git a/clang/test/OpenMP/teams_distribute_loop_messages.cpp b/clang/test/OpenMP/teams_distribute_loop_messages.cpp
index 167f653e2cd7..e5f146679e5f 100644
--- a/clang/test/OpenMP/teams_distribute_loop_messages.cpp
+++ b/clang/test/OpenMP/teams_distribute_loop_messages.cpp
@@ -416,7 +416,7 @@ int test_with_random_access_iterator() {
Iter0 begin0, end0;
#pragma omp target
#pragma omp teams distribute
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
@@ -425,31 +425,31 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute
@@ -464,7 +464,7 @@ int test_with_random_access_iterator() {
++begin;
#pragma omp target
#pragma omp teams distribute
- for (begin = end; begin < end; ++begin) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute
@@ -489,7 +489,7 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute
@@ -551,19 +551,19 @@ public:
#pragma omp teams distribute
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute
- for (IT I = begin; I < end; ++I) { // expected-warning 4 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; ++I) { // expected-warning 4 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -599,7 +599,7 @@ int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams distribute
- for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -702,7 +702,7 @@ void test_loop_firstprivate_lastprivate() {
S s(4);
// expected-error@+2 {{lastprivate variable cannot be firstprivate}} expected-note@+2 {{defined as lastprivate}}
#pragma omp target
-#pragma omp teams distribute lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{Type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 16; ++i)
;
}
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp
index cdfc5eaec228..67e3ce4dc157 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_loop_messages.cpp
@@ -414,7 +414,7 @@ int test_with_random_access_iterator() {
Iter0 begin0, end0;
#pragma omp target
#pragma omp teams distribute parallel for
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
@@ -423,31 +423,31 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute parallel for
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute parallel for
@@ -462,7 +462,7 @@ int test_with_random_access_iterator() {
++begin;
#pragma omp target
#pragma omp teams distribute parallel for
- for (begin = end; begin < end; ++begin) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute parallel for
@@ -487,7 +487,7 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute parallel for
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for
@@ -549,19 +549,19 @@ public:
#pragma omp teams distribute parallel for
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute parallel for
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute parallel for
- for (IT I = begin; I < end; ++I) { // expected-warning 4 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; ++I) { // expected-warning 4 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -597,7 +597,7 @@ int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams distribute parallel for
- for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -697,7 +697,7 @@ void test_loop_firstprivate_lastprivate() {
S s(4);
// expected-error@+2 {{lastprivate variable cannot be firstprivate}} expected-note@+2 {{defined as lastprivate}}
#pragma omp target
-#pragma omp teams distribute parallel for lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{Type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute parallel for lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 16; ++i)
;
}
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp
index 645035a3a163..7ee8b9c9d367 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_simd_loop_messages.cpp
@@ -416,7 +416,7 @@ int test_with_random_access_iterator() {
Iter0 begin0, end0;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
@@ -425,31 +425,31 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute parallel for simd
@@ -464,7 +464,7 @@ int test_with_random_access_iterator() {
++begin;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (begin = end; begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute parallel for simd
@@ -489,7 +489,7 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute parallel for simd
@@ -551,19 +551,19 @@ public:
#pragma omp teams distribute parallel for simd
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute parallel for simd
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (IT I = begin; I < end; ++I) { // expected-warning 4 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; ++I) { // expected-warning 4 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -599,7 +599,7 @@ int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams distribute parallel for simd
- for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -699,7 +699,7 @@ void test_loop_firstprivate_lastprivate() {
S s(4);
// expected-error@+2 {{lastprivate variable cannot be firstprivate}} expected-note@+2 {{defined as lastprivate}}
#pragma omp target
-#pragma omp teams distribute parallel for simd lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{Type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute parallel for simd lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 16; ++i)
;
}
diff --git a/clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp b/clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp
index 13eef6a98b3d..8bfddbf6e9ee 100644
--- a/clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp
+++ b/clang/test/OpenMP/teams_distribute_simd_loop_messages.cpp
@@ -416,7 +416,7 @@ int test_with_random_access_iterator() {
Iter0 begin0, end0;
#pragma omp target
#pragma omp teams distribute simd
- for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
@@ -425,31 +425,31 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute simd
- for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; --I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(begin); I < end; ++I) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(nullptr); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(0); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(0); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
// expected-warning@+1 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
- for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I(1, 2); I < end; ++I) // expected-warning {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
- for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = GoodIter(0); begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute simd
@@ -464,7 +464,7 @@ int test_with_random_access_iterator() {
++begin;
#pragma omp target
#pragma omp teams distribute simd
- for (begin = end; begin < end; ++begin) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (begin = end; begin < end; ++begin) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++begin;
#pragma omp target
#pragma omp teams distribute simd
@@ -489,7 +489,7 @@ int test_with_random_access_iterator() {
++I;
#pragma omp target
#pragma omp teams distribute simd
- for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (GoodIter I = begin; I >= end; I = I - 1) // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
#pragma omp target
#pragma omp teams distribute simd
@@ -551,19 +551,19 @@ public:
#pragma omp teams distribute simd
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I = I + ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute simd
// expected-note@+2 {{loop step is expected to be positive due to this condition}}
// expected-error@+1 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
- for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I <= end; I += ST) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
#pragma omp target
#pragma omp teams distribute simd
- for (IT I = begin; I < end; ++I) { // expected-warning 4 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; ++I) { // expected-warning 4 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -599,7 +599,7 @@ int dotest_gt(IT begin, IT end) {
#pragma omp target
#pragma omp teams distribute simd
- for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{Type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
+ for (IT I = begin; I < end; I += TC<int, ST>::step()) { // expected-warning 2 {{type 'GoodIter' is not trivially copyable and not guaranteed to be mapped correctly}}
++I;
}
}
@@ -699,7 +699,7 @@ void test_loop_firstprivate_lastprivate() {
S s(4);
// expected-error@+2 {{lastprivate variable cannot be firstprivate}} expected-note@+2 {{defined as lastprivate}}
#pragma omp target
-#pragma omp teams distribute simd lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{Type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
+#pragma omp teams distribute simd lastprivate(s) firstprivate(s) // expected-error {{calling a private constructor of class 'S'}} expected-warning {{type 'S' is not trivially copyable and not guaranteed to be mapped correctly}}
for (int i = 0; i < 16; ++i)
;
}
diff --git a/clang/test/OpenMP/tile_codegen.cpp b/clang/test/OpenMP/tile_codegen.cpp
index 93a3a14133ab..5fd5609b844c 100644
--- a/clang/test/OpenMP/tile_codegen.cpp
+++ b/clang/test/OpenMP/tile_codegen.cpp
@@ -1,10 +1,10 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 4
// Check code generation
-// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -std=c++20 -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK1
// Check same results after serialization round-trip
-// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -fopenmp -emit-pch -o %t %s
-// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -std=c++20 -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fclang-abi-compat=latest -std=c++20 -fopenmp -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK2
// expected-no-diagnostics
#ifndef HEADER
@@ -91,22 +91,38 @@ extern "C" void foo8(int a) {
}
+typedef struct { double array[12]; } data_t;
+extern "C" void foo9(data_t data) {
+#pragma omp tile sizes(5)
+ for (double v : data.array)
+ body(v);
+}
+
+
+extern "C" void foo10(data_t data) {
+#pragma omp tile sizes(5)
+ for (double c = 42.0; double v : data.array)
+ body(c, v);
+}
+
+
#endif /* HEADER */
-// CHECK1-LABEL: define {{[^@]+}}@body
-// CHECK1-SAME: (...) #[[ATTR0:[0-9]+]] {
+
+// CHECK1-LABEL: define dso_local void @body(
+// CHECK1-SAME: ...) #[[ATTR0:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@__cxx_global_var_init
-// CHECK1-SAME: () #[[ATTR1:[0-9]+]] section ".text.startup" {
+// CHECK1-LABEL: define internal void @__cxx_global_var_init(
+// CHECK1-SAME: ) #[[ATTR1:[0-9]+]] section ".text.startup" {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @s)
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_ZN1SC1Ev
-// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK1-LABEL: define linkonce_odr void @_ZN1SC1Ev(
+// CHECK1-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -115,50 +131,52 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_ZN1SC2Ev
-// CHECK1-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK1-LABEL: define linkonce_odr void @_ZN1SC2Ev(
+// CHECK1-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT: [[I:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[I2:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT: store ptr [[I2]], ptr [[I]], align 8
+// CHECK1-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: store i32 7, ptr [[I]], align 4
+// CHECK1-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[THIS1]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[I3]], ptr [[I2]], align 8
// CHECK1-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND:%.*]]
// CHECK1: for.cond:
// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
-// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END11:%.*]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END12:%.*]]
// CHECK1: for.body:
// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: br label [[FOR_COND3:%.*]]
-// CHECK1: for.cond3:
+// CHECK1-NEXT: br label [[FOR_COND4:%.*]]
+// CHECK1: for.cond4:
// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 5
-// CHECK1-NEXT: [[CMP4:%.*]] = icmp slt i32 4, [[ADD]]
-// CHECK1-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1-NEXT: [[CMP5:%.*]] = icmp slt i32 4, [[ADD]]
+// CHECK1-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP4]], 5
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP4]], 5
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
-// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[ADD5]], [[COND_FALSE]] ]
-// CHECK1-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP2]], [[COND]]
-// CHECK1-NEXT: br i1 [[CMP6]], label [[FOR_BODY7:%.*]], label [[FOR_END:%.*]]
-// CHECK1: for.body7:
+// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[ADD6]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP2]], [[COND]]
+// CHECK1-NEXT: br i1 [[CMP7]], label [[FOR_BODY8:%.*]], label [[FOR_END:%.*]]
+// CHECK1: for.body8:
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP5]], 3
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 7, [[MUL]]
-// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I]], align 8
-// CHECK1-NEXT: store i32 [[ADD8]], ptr [[TMP6]], align 4
-// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I]], align 8
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I2]], align 8
+// CHECK1-NEXT: store i32 [[ADD9]], ptr [[TMP6]], align 4
+// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I2]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP8]])
// CHECK1-NEXT: br label [[FOR_INC:%.*]]
@@ -166,20 +184,20 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK1-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP3:![0-9]+]]
// CHECK1: for.end:
-// CHECK1-NEXT: br label [[FOR_INC9:%.*]]
-// CHECK1: for.inc9:
+// CHECK1-NEXT: br label [[FOR_INC10:%.*]]
+// CHECK1: for.inc10:
// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP10]], 5
-// CHECK1-NEXT: store i32 [[ADD10]], ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP10]], 5
+// CHECK1-NEXT: store i32 [[ADD11]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
-// CHECK1: for.end11:
+// CHECK1: for.end12:
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo1
-// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo1(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -195,81 +213,83 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
// CHECK1-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
// CHECK1-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[END_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
-// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]]
+// CHECK1-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK1-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
// CHECK1-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP5]]
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP6]]
+// CHECK1-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
// CHECK1-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
// CHECK1-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND:%.*]]
// CHECK1: for.cond:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
-// CHECK1-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP7]], [[ADD5]]
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD5:%.*]] = add i32 [[TMP9]], 1
+// CHECK1-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP8]], [[ADD5]]
// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END17:%.*]]
// CHECK1: for.body:
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: store i32 [[TMP10]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND6:%.*]]
// CHECK1: for.cond6:
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[ADD7:%.*]] = add i32 [[TMP11]], 1
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP12]], 5
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD7:%.*]] = add i32 [[TMP12]], 1
+// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[ADD8:%.*]] = add i32 [[TMP13]], 5
// CHECK1-NEXT: [[CMP9:%.*]] = icmp ult i32 [[ADD7]], [[ADD8]]
// CHECK1-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
-// CHECK1-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK1-NEXT: [[ADD10:%.*]] = add i32 [[TMP13]], 1
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT: [[ADD10:%.*]] = add i32 [[TMP14]], 1
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP14]], 5
+// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP15]], 5
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[ADD10]], [[COND_TRUE]] ], [ [[ADD11]], [[COND_FALSE]] ]
-// CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP10]], [[COND]]
+// CHECK1-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[COND]]
// CHECK1-NEXT: br i1 [[CMP12]], label [[FOR_BODY13:%.*]], label [[FOR_END:%.*]]
// CHECK1: for.body13:
-// CHECK1-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP16]], [[TMP17]]
-// CHECK1-NEXT: [[ADD14:%.*]] = add i32 [[TMP15]], [[MUL]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK1-NEXT: [[MUL:%.*]] = mul i32 [[TMP17]], [[TMP18]]
+// CHECK1-NEXT: [[ADD14:%.*]] = add i32 [[TMP16]], [[MUL]]
// CHECK1-NEXT: store i32 [[ADD14]], ptr [[I]], align 4
-// CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4
-// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP18]])
+// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4
+// CHECK1-NEXT: call void (...) @body(i32 noundef [[TMP19]])
// CHECK1-NEXT: br label [[FOR_INC:%.*]]
// CHECK1: for.inc:
-// CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP20]], 1
// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP6:![0-9]+]]
// CHECK1: for.end:
// CHECK1-NEXT: br label [[FOR_INC15:%.*]]
// CHECK1: for.inc15:
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP20]], 5
+// CHECK1-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK1-NEXT: [[ADD16:%.*]] = add i32 [[TMP21]], 5
// CHECK1-NEXT: store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
// CHECK1: for.end17:
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo2
-// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo2(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -381,8 +401,8 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo3
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo3(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -523,8 +543,8 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo4
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo4(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -676,8 +696,8 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo5
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo5(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
// CHECK1-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -885,15 +905,15 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo6
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo6(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo6.omp_outlined)
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo6.omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK1-LABEL: define internal void @foo6.omp_outlined(
+// CHECK1-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -988,15 +1008,15 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@tfoo7
-// CHECK1-SAME: () #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @tfoo7(
+// CHECK1-SAME: ) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: call void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(i32 noundef 0, i32 noundef 42)
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_
-// CHECK1-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR0]] comdat {
+// CHECK1-LABEL: define linkonce_odr void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(
+// CHECK1-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR0]] comdat {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -1039,7 +1059,7 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK1-NEXT: [[ADD7:%.*]] = add i32 [[TMP9]], 1
// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP10]], 5
+// CHECK1-NEXT: [[ADD8:%.*]] = add i32 [[TMP10]], 5
// CHECK1-NEXT: [[CMP9:%.*]] = icmp ult i32 [[ADD7]], [[ADD8]]
// CHECK1-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
@@ -1048,7 +1068,7 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP12]], 5
+// CHECK1-NEXT: [[ADD11:%.*]] = add i32 [[TMP12]], 5
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i32 [ [[ADD10]], [[COND_TRUE]] ], [ [[ADD11]], [[COND_FALSE]] ]
@@ -1065,22 +1085,22 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: br label [[FOR_INC:%.*]]
// CHECK1: for.inc:
// CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK1-NEXT: [[INC:%.*]] = add i32 [[TMP16]], 1
// CHECK1-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP21:![0-9]+]]
// CHECK1: for.end:
// CHECK1-NEXT: br label [[FOR_INC15:%.*]]
// CHECK1: for.inc15:
// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP17]], 5
+// CHECK1-NEXT: [[ADD16:%.*]] = add i32 [[TMP17]], 5
// CHECK1-NEXT: store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
// CHECK1: for.end17:
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@foo8
-// CHECK1-SAME: (i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK1-LABEL: define dso_local void @foo8(
+// CHECK1-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[I:%.*]] = alloca i32, align 4
@@ -1168,22 +1188,219 @@ extern "C" void foo8(int a) {
// CHECK1-NEXT: ret void
//
//
-// CHECK1-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_tile_codegen.cpp
-// CHECK1-SAME: () #[[ATTR1]] section ".text.startup" {
+// CHECK1-LABEL: define dso_local void @foo9(
+// CHECK1-SAME: ptr noundef byval([[STRUCT_DATA_T:%.*]]) align 8 [[DATA:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTFLOOR_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTTILE_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK1-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_DATA_T]], ptr [[DATA]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[ARRAY]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 12
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK1-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND:%.*]]
+// CHECK1: for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END18:%.*]]
+// CHECK1: for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: store i64 [[TMP8]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND7:%.*]]
+// CHECK1: for.cond7:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i64 [[TMP10]], 1
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP11]], 5
+// CHECK1-NEXT: [[CMP10:%.*]] = icmp slt i64 [[ADD8]], [[ADD9]]
+// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i64 [[TMP12]], 1
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i64 [[TMP13]], 5
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[ADD11]], [[COND_TRUE]] ], [ [[ADD12]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP9]], [[COND]]
+// CHECK1-NEXT: br i1 [[CMP13]], label [[FOR_BODY14:%.*]], label [[FOR_END:%.*]]
+// CHECK1: for.body14:
+// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP15]], 1
+// CHECK1-NEXT: [[ADD_PTR15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 [[MUL]]
+// CHECK1-NEXT: store ptr [[ADD_PTR15]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+// CHECK1-NEXT: store double [[TMP17]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load double, ptr [[V]], align 8
+// CHECK1-NEXT: call void (...) @body(double noundef [[TMP18]])
+// CHECK1-NEXT: br label [[FOR_INC:%.*]]
+// CHECK1: for.inc:
+// CHECK1-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP19]], 1
+// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP25:![0-9]+]]
+// CHECK1: for.end:
+// CHECK1-NEXT: br label [[FOR_INC16:%.*]]
+// CHECK1: for.inc16:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP20]], 5
+// CHECK1-NEXT: store i64 [[ADD17]], ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP26:![0-9]+]]
+// CHECK1: for.end18:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define dso_local void @foo10(
+// CHECK1-SAME: ptr noundef byval([[STRUCT_DATA_T:%.*]]) align 8 [[DATA:%.*]]) #[[ATTR0]] {
+// CHECK1-NEXT: entry:
+// CHECK1-NEXT: [[C:%.*]] = alloca double, align 8
+// CHECK1-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK1-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTFLOOR_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[DOTTILE_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK1-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK1-NEXT: store double 4.200000e+01, ptr [[C]], align 8
+// CHECK1-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_DATA_T]], ptr [[DATA]], i32 0, i32 0
+// CHECK1-NEXT: store ptr [[ARRAY]], ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK1-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 12
+// CHECK1-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK1-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK1-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK1-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK1-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK1-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK1-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK1-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK1-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK1-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK1-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: store i64 0, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND:%.*]]
+// CHECK1: for.cond:
+// CHECK1-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK1-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK1-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END18:%.*]]
+// CHECK1: for.body:
+// CHECK1-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: store i64 [[TMP8]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND7:%.*]]
+// CHECK1: for.cond7:
+// CHECK1-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD8:%.*]] = add nsw i64 [[TMP10]], 1
+// CHECK1-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP11]], 5
+// CHECK1-NEXT: [[CMP10:%.*]] = icmp slt i64 [[ADD8]], [[ADD9]]
+// CHECK1-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK1: cond.true:
+// CHECK1-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK1-NEXT: [[ADD11:%.*]] = add nsw i64 [[TMP12]], 1
+// CHECK1-NEXT: br label [[COND_END:%.*]]
+// CHECK1: cond.false:
+// CHECK1-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD12:%.*]] = add nsw i64 [[TMP13]], 5
+// CHECK1-NEXT: br label [[COND_END]]
+// CHECK1: cond.end:
+// CHECK1-NEXT: [[COND:%.*]] = phi i64 [ [[ADD11]], [[COND_TRUE]] ], [ [[ADD12]], [[COND_FALSE]] ]
+// CHECK1-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP9]], [[COND]]
+// CHECK1-NEXT: br i1 [[CMP13]], label [[FOR_BODY14:%.*]], label [[FOR_END:%.*]]
+// CHECK1: for.body14:
+// CHECK1-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK1-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP15]], 1
+// CHECK1-NEXT: [[ADD_PTR15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 [[MUL]]
+// CHECK1-NEXT: store ptr [[ADD_PTR15]], ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK1-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+// CHECK1-NEXT: store double [[TMP17]], ptr [[V]], align 8
+// CHECK1-NEXT: [[TMP18:%.*]] = load double, ptr [[C]], align 8
+// CHECK1-NEXT: [[TMP19:%.*]] = load double, ptr [[V]], align 8
+// CHECK1-NEXT: call void (...) @body(double noundef [[TMP18]], double noundef [[TMP19]])
+// CHECK1-NEXT: br label [[FOR_INC:%.*]]
+// CHECK1: for.inc:
+// CHECK1-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i64 [[TMP20]], 1
+// CHECK1-NEXT: store i64 [[INC]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP27:![0-9]+]]
+// CHECK1: for.end:
+// CHECK1-NEXT: br label [[FOR_INC16:%.*]]
+// CHECK1: for.inc16:
+// CHECK1-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP21]], 5
+// CHECK1-NEXT: store i64 [[ADD17]], ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
+// CHECK1: for.end18:
+// CHECK1-NEXT: ret void
+//
+//
+// CHECK1-LABEL: define internal void @_GLOBAL__sub_I_tile_codegen.cpp(
+// CHECK1-SAME: ) #[[ATTR1]] section ".text.startup" {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: call void @__cxx_global_var_init()
// CHECK1-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@__cxx_global_var_init
-// CHECK2-SAME: () #[[ATTR0:[0-9]+]] section ".text.startup" {
+// CHECK2-LABEL: define internal void @__cxx_global_var_init(
+// CHECK2-SAME: ) #[[ATTR0:[0-9]+]] section ".text.startup" {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) @s)
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@_ZN1SC1Ev
-// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
+// CHECK2-LABEL: define linkonce_odr void @_ZN1SC1Ev(
+// CHECK2-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1:[0-9]+]] comdat align 2 {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
@@ -1192,50 +1409,52 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@_ZN1SC2Ev
-// CHECK2-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
+// CHECK2-LABEL: define linkonce_odr void @_ZN1SC2Ev(
+// CHECK2-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK2-NEXT: [[I:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[I2:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[DOTTILE_0_IV_I:%.*]] = alloca i32, align 4
// CHECK2-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
// CHECK2-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK2-NEXT: [[I2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
-// CHECK2-NEXT: store ptr [[I2]], ptr [[I]], align 8
+// CHECK2-NEXT: [[I:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK2-NEXT: store i32 7, ptr [[I]], align 4
+// CHECK2-NEXT: [[I3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[THIS1]], i32 0, i32 0
+// CHECK2-NEXT: store ptr [[I3]], ptr [[I2]], align 8
// CHECK2-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND:%.*]]
// CHECK2: for.cond:
// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], 4
-// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END11:%.*]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END12:%.*]]
// CHECK2: for.body:
// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND3:%.*]]
-// CHECK2: for.cond3:
+// CHECK2-NEXT: br label [[FOR_COND4:%.*]]
+// CHECK2: for.cond4:
// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 5
-// CHECK2-NEXT: [[CMP4:%.*]] = icmp slt i32 4, [[ADD]]
-// CHECK2-NEXT: br i1 [[CMP4]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2-NEXT: [[CMP5:%.*]] = icmp slt i32 4, [[ADD]]
+// CHECK2-NEXT: br i1 [[CMP5]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP4]], 5
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP4]], 5
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
-// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[ADD5]], [[COND_FALSE]] ]
-// CHECK2-NEXT: [[CMP6:%.*]] = icmp slt i32 [[TMP2]], [[COND]]
-// CHECK2-NEXT: br i1 [[CMP6]], label [[FOR_BODY7:%.*]], label [[FOR_END:%.*]]
-// CHECK2: for.body7:
+// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ 4, [[COND_TRUE]] ], [ [[ADD6]], [[COND_FALSE]] ]
+// CHECK2-NEXT: [[CMP7:%.*]] = icmp slt i32 [[TMP2]], [[COND]]
+// CHECK2-NEXT: br i1 [[CMP7]], label [[FOR_BODY8:%.*]], label [[FOR_END:%.*]]
+// CHECK2: for.body8:
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP5]], 3
-// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 7, [[MUL]]
-// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I]], align 8
-// CHECK2-NEXT: store i32 [[ADD8]], ptr [[TMP6]], align 4
-// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I]], align 8
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 7, [[MUL]]
+// CHECK2-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I2]], align 8
+// CHECK2-NEXT: store i32 [[ADD9]], ptr [[TMP6]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I2]], align 8
// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP8]])
// CHECK2-NEXT: br label [[FOR_INC:%.*]]
@@ -1243,26 +1462,26 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP9]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP3:![0-9]+]]
// CHECK2: for.end:
-// CHECK2-NEXT: br label [[FOR_INC9:%.*]]
-// CHECK2: for.inc9:
+// CHECK2-NEXT: br label [[FOR_INC10:%.*]]
+// CHECK2: for.inc10:
// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP10]], 5
-// CHECK2-NEXT: store i32 [[ADD10]], ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP10]], 5
+// CHECK2-NEXT: store i32 [[ADD11]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP5:![0-9]+]]
-// CHECK2: for.end11:
+// CHECK2: for.end12:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@body
-// CHECK2-SAME: (...) #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @body(
+// CHECK2-SAME: ...) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo1
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo1(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -1278,81 +1497,183 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: store i32 [[END]], ptr [[END_ADDR]], align 4
// CHECK2-NEXT: store i32 [[STEP]], ptr [[STEP_ADDR]], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = load i32, ptr [[START_ADDR]], align 4
-// CHECK2-NEXT: store i32 [[TMP0]], ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[END_ADDR]], align 4
-// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
-// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTNEW_STEP]], align 4
-// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
-// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP3]], [[TMP4]]
+// CHECK2-NEXT: store i32 [[TMP0]], ptr [[I]], align 4
+// CHECK2-NEXT: [[TMP1:%.*]] = load i32, ptr [[START_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP1]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, ptr [[END_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP2]], ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[STEP_ADDR]], align 4
+// CHECK2-NEXT: store i32 [[TMP3]], ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[SUB:%.*]] = sub i32 [[TMP4]], [[TMP5]]
// CHECK2-NEXT: [[SUB3:%.*]] = sub i32 [[SUB]], 1
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP5]]
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP6]]
+// CHECK2-NEXT: [[ADD:%.*]] = add i32 [[SUB3]], [[TMP6]]
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[DIV:%.*]] = udiv i32 [[ADD]], [[TMP7]]
// CHECK2-NEXT: [[SUB4:%.*]] = sub i32 [[DIV]], 1
// CHECK2-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK2-NEXT: store i32 0, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND:%.*]]
// CHECK2: for.cond:
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP8]], 1
-// CHECK2-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP7]], [[ADD5]]
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD5:%.*]] = add i32 [[TMP9]], 1
+// CHECK2-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP8]], [[ADD5]]
// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END17:%.*]]
// CHECK2: for.body:
-// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: store i32 [[TMP9]], ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: store i32 [[TMP10]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND6:%.*]]
// CHECK2: for.cond6:
-// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK2-NEXT: [[ADD7:%.*]] = add i32 [[TMP11]], 1
-// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP12]], 5
+// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD7:%.*]] = add i32 [[TMP12]], 1
+// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[ADD8:%.*]] = add i32 [[TMP13]], 5
// CHECK2-NEXT: [[CMP9:%.*]] = icmp ult i32 [[ADD7]], [[ADD8]]
// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
-// CHECK2-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
-// CHECK2-NEXT: [[ADD10:%.*]] = add i32 [[TMP13]], 1
+// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK2-NEXT: [[ADD10:%.*]] = add i32 [[TMP14]], 1
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
-// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP14]], 5
+// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[ADD11:%.*]] = add i32 [[TMP15]], 5
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[ADD10]], [[COND_TRUE]] ], [ [[ADD11]], [[COND_FALSE]] ]
-// CHECK2-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP10]], [[COND]]
+// CHECK2-NEXT: [[CMP12:%.*]] = icmp ult i32 [[TMP11]], [[COND]]
// CHECK2-NEXT: br i1 [[CMP12]], label [[FOR_BODY13:%.*]], label [[FOR_END:%.*]]
// CHECK2: for.body13:
-// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
-// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
-// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP16]], [[TMP17]]
-// CHECK2-NEXT: [[ADD14:%.*]] = add i32 [[TMP15]], [[MUL]]
+// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTNEW_STEP]], align 4
+// CHECK2-NEXT: [[MUL:%.*]] = mul i32 [[TMP17]], [[TMP18]]
+// CHECK2-NEXT: [[ADD14:%.*]] = add i32 [[TMP16]], [[MUL]]
// CHECK2-NEXT: store i32 [[ADD14]], ptr [[I]], align 4
-// CHECK2-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4
-// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP18]])
+// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4
+// CHECK2-NEXT: call void (...) @body(i32 noundef [[TMP19]])
// CHECK2-NEXT: br label [[FOR_INC:%.*]]
// CHECK2: for.inc:
-// CHECK2-NEXT: [[TMP19:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP19]], 1
+// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP20]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP6:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC15:%.*]]
// CHECK2: for.inc15:
-// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP20]], 5
+// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
+// CHECK2-NEXT: [[ADD16:%.*]] = add i32 [[TMP21]], 5
// CHECK2-NEXT: store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
// CHECK2: for.end17:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo2
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo10(
+// CHECK2-SAME: ptr noundef byval([[STRUCT_DATA_T:%.*]]) align 8 [[DATA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[C:%.*]] = alloca double, align 8
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTFLOOR_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTTILE_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK2-NEXT: store double 4.200000e+01, ptr [[C]], align 8
+// CHECK2-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_DATA_T]], ptr [[DATA]], i32 0, i32 0
+// CHECK2-NEXT: store ptr [[ARRAY]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 12
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK2-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK2-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND:%.*]]
+// CHECK2: for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END18:%.*]]
+// CHECK2: for.body:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: store i64 [[TMP8]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND7:%.*]]
+// CHECK2: for.cond7:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i64 [[TMP10]], 1
+// CHECK2-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP11]], 5
+// CHECK2-NEXT: [[CMP10:%.*]] = icmp slt i64 [[ADD8]], [[ADD9]]
+// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i64 [[TMP12]], 1
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i64 [[TMP13]], 5
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[ADD11]], [[COND_TRUE]] ], [ [[ADD12]], [[COND_FALSE]] ]
+// CHECK2-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP9]], [[COND]]
+// CHECK2-NEXT: br i1 [[CMP13]], label [[FOR_BODY14:%.*]], label [[FOR_END:%.*]]
+// CHECK2: for.body14:
+// CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP15]], 1
+// CHECK2-NEXT: [[ADD_PTR15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 [[MUL]]
+// CHECK2-NEXT: store ptr [[ADD_PTR15]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+// CHECK2-NEXT: store double [[TMP17]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load double, ptr [[C]], align 8
+// CHECK2-NEXT: [[TMP19:%.*]] = load double, ptr [[V]], align 8
+// CHECK2-NEXT: call void (...) @body(double noundef [[TMP18]], double noundef [[TMP19]])
+// CHECK2-NEXT: br label [[FOR_INC:%.*]]
+// CHECK2: for.inc:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP20]], 1
+// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2: for.end:
+// CHECK2-NEXT: br label [[FOR_INC16:%.*]]
+// CHECK2: for.inc16:
+// CHECK2-NEXT: [[TMP21:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP21]], 5
+// CHECK2-NEXT: store i64 [[ADD17]], ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2: for.end18:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @foo2(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]], i32 noundef [[STEP:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -1438,34 +1759,34 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTTILE_1_IV_J]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP14]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND10]], !llvm.loop [[LOOP8:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND10]], !llvm.loop [[LOOP10:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC22:%.*]]
// CHECK2: for.inc22:
// CHECK2-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC23:%.*]] = add nsw i32 [[TMP15]], 1
// CHECK2-NEXT: store i32 [[INC23]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP9:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND4]], !llvm.loop [[LOOP11:![0-9]+]]
// CHECK2: for.end24:
// CHECK2-NEXT: br label [[FOR_INC25:%.*]]
// CHECK2: for.inc25:
// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTFLOOR_1_IV_J]], align 4
// CHECK2-NEXT: [[ADD26:%.*]] = add nsw i32 [[TMP16]], 5
// CHECK2-NEXT: store i32 [[ADD26]], ptr [[DOTFLOOR_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP10:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP12:![0-9]+]]
// CHECK2: for.end27:
// CHECK2-NEXT: br label [[FOR_INC28:%.*]]
// CHECK2: for.inc28:
// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: [[ADD29:%.*]] = add nsw i32 [[TMP17]], 5
// CHECK2-NEXT: store i32 [[ADD29]], ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP11:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP13:![0-9]+]]
// CHECK2: for.end30:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo3
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo3(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -1574,21 +1895,21 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP20:%.*]] = load i32, ptr [[DOTTILE_1_IV_J]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP20]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND15]], !llvm.loop [[LOOP12:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND15]], !llvm.loop [[LOOP14:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC27:%.*]]
// CHECK2: for.inc27:
// CHECK2-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC28:%.*]] = add nsw i32 [[TMP21]], 1
// CHECK2-NEXT: store i32 [[INC28]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP13:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND3]], !llvm.loop [[LOOP15:![0-9]+]]
// CHECK2: for.end29:
// CHECK2-NEXT: br label [[FOR_INC30:%.*]]
// CHECK2: for.inc30:
// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTFLOOR_1_IV_J]], align 4
// CHECK2-NEXT: [[ADD31:%.*]] = add nsw i32 [[TMP22]], 5
// CHECK2-NEXT: store i32 [[ADD31]], ptr [[DOTFLOOR_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP14:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]]
// CHECK2: for.end32:
// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK2: omp.body.continue:
@@ -1606,8 +1927,8 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo4
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo4(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -1727,21 +2048,21 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP22:%.*]] = load i32, ptr [[DOTTILE_1_IV_J]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP22]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND20]], !llvm.loop [[LOOP15:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND20]], !llvm.loop [[LOOP17:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC32:%.*]]
// CHECK2: for.inc32:
// CHECK2-NEXT: [[TMP23:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC33:%.*]] = add nsw i32 [[TMP23]], 1
// CHECK2-NEXT: store i32 [[INC33]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND8]], !llvm.loop [[LOOP16:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND8]], !llvm.loop [[LOOP18:![0-9]+]]
// CHECK2: for.end34:
// CHECK2-NEXT: br label [[FOR_INC35:%.*]]
// CHECK2: for.inc35:
// CHECK2-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTFLOOR_1_IV_J]], align 4
// CHECK2-NEXT: [[ADD36:%.*]] = add nsw i32 [[TMP24]], 5
// CHECK2-NEXT: store i32 [[ADD36]], ptr [[DOTFLOOR_1_IV_J]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP17:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]]
// CHECK2: for.end37:
// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK2: omp.body.continue:
@@ -1759,8 +2080,8 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo5
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo5(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTOMP_IV:%.*]] = alloca i64, align 8
// CHECK2-NEXT: [[TMP:%.*]] = alloca i32, align 4
@@ -1968,15 +2289,15 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo6
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo6(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 0, ptr @foo6.omp_outlined)
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo6.omp_outlined
-// CHECK2-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK2-LABEL: define internal void @foo6.omp_outlined(
+// CHECK2-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR4:[0-9]+]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK2-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -2054,7 +2375,7 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP14]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP18:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP20:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK2: omp.body.continue:
@@ -2071,8 +2392,8 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@foo8
-// CHECK2-SAME: (i32 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo8(
+// CHECK2-SAME: i32 noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[I:%.*]] = alloca i32, align 4
@@ -2138,7 +2459,7 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP11]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP21:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND1]], !llvm.loop [[LOOP23:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC17:%.*]]
// CHECK2: for.inc17:
@@ -2155,20 +2476,117 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
// CHECK2-NEXT: [[ADD23:%.*]] = add nsw i32 [[TMP14]], [[COND22]]
// CHECK2-NEXT: store i32 [[ADD23]], ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]]
// CHECK2: for.end24:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@tfoo7
-// CHECK2-SAME: () #[[ATTR1]] {
+// CHECK2-LABEL: define dso_local void @foo9(
+// CHECK2-SAME: ptr noundef byval([[STRUCT_DATA_T:%.*]]) align 8 [[DATA:%.*]]) #[[ATTR1]] {
+// CHECK2-NEXT: entry:
+// CHECK2-NEXT: [[__RANGE2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__END2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[__BEGIN2:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca ptr, align 8
+// CHECK2-NEXT: [[DOTCAPTURE_EXPR_4:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTFLOOR_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[DOTTILE_0_IV___BEGIN2:%.*]] = alloca i64, align 8
+// CHECK2-NEXT: [[V:%.*]] = alloca double, align 8
+// CHECK2-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_DATA_T]], ptr [[DATA]], i32 0, i32 0
+// CHECK2-NEXT: store ptr [[ARRAY]], ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP0]], i64 0, i64 0
+// CHECK2-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds double, ptr [[ARRAYDECAY]], i64 12
+// CHECK2-NEXT: store ptr [[ADD_PTR]], ptr [[__END2]], align 8
+// CHECK2-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP1]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY1]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[__RANGE2]], align 8
+// CHECK2-NEXT: [[ARRAYDECAY2:%.*]] = getelementptr inbounds [12 x double], ptr [[TMP2]], i64 0, i64 0
+// CHECK2-NEXT: store ptr [[ARRAYDECAY2]], ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[__END2]], align 8
+// CHECK2-NEXT: store ptr [[TMP3]], ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_3]], align 8
+// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP4]] to i64
+// CHECK2-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP5]] to i64
+// CHECK2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+// CHECK2-NEXT: [[SUB_PTR_DIV:%.*]] = sdiv exact i64 [[SUB_PTR_SUB]], 8
+// CHECK2-NEXT: [[SUB:%.*]] = sub nsw i64 [[SUB_PTR_DIV]], 1
+// CHECK2-NEXT: [[ADD:%.*]] = add nsw i64 [[SUB]], 1
+// CHECK2-NEXT: [[DIV:%.*]] = sdiv i64 [[ADD]], 1
+// CHECK2-NEXT: [[SUB5:%.*]] = sub nsw i64 [[DIV]], 1
+// CHECK2-NEXT: store i64 [[SUB5]], ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: store i64 0, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND:%.*]]
+// CHECK2: for.cond:
+// CHECK2-NEXT: [[TMP6:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP7:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD6:%.*]] = add nsw i64 [[TMP7]], 1
+// CHECK2-NEXT: [[CMP:%.*]] = icmp slt i64 [[TMP6]], [[ADD6]]
+// CHECK2-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END18:%.*]]
+// CHECK2: for.body:
+// CHECK2-NEXT: [[TMP8:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: store i64 [[TMP8]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND7:%.*]]
+// CHECK2: for.cond7:
+// CHECK2-NEXT: [[TMP9:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP10:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i64 [[TMP10]], 1
+// CHECK2-NEXT: [[TMP11:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD9:%.*]] = add nsw i64 [[TMP11]], 5
+// CHECK2-NEXT: [[CMP10:%.*]] = icmp slt i64 [[ADD8]], [[ADD9]]
+// CHECK2-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK2: cond.true:
+// CHECK2-NEXT: [[TMP12:%.*]] = load i64, ptr [[DOTCAPTURE_EXPR_4]], align 8
+// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i64 [[TMP12]], 1
+// CHECK2-NEXT: br label [[COND_END:%.*]]
+// CHECK2: cond.false:
+// CHECK2-NEXT: [[TMP13:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD12:%.*]] = add nsw i64 [[TMP13]], 5
+// CHECK2-NEXT: br label [[COND_END]]
+// CHECK2: cond.end:
+// CHECK2-NEXT: [[COND:%.*]] = phi i64 [ [[ADD11]], [[COND_TRUE]] ], [ [[ADD12]], [[COND_FALSE]] ]
+// CHECK2-NEXT: [[CMP13:%.*]] = icmp slt i64 [[TMP9]], [[COND]]
+// CHECK2-NEXT: br i1 [[CMP13]], label [[FOR_BODY14:%.*]], label [[FOR_END:%.*]]
+// CHECK2: for.body14:
+// CHECK2-NEXT: [[TMP14:%.*]] = load ptr, ptr [[DOTCAPTURE_EXPR_]], align 8
+// CHECK2-NEXT: [[TMP15:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[MUL:%.*]] = mul nsw i64 [[TMP15]], 1
+// CHECK2-NEXT: [[ADD_PTR15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i64 [[MUL]]
+// CHECK2-NEXT: store ptr [[ADD_PTR15]], ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP16:%.*]] = load ptr, ptr [[__BEGIN2]], align 8
+// CHECK2-NEXT: [[TMP17:%.*]] = load double, ptr [[TMP16]], align 8
+// CHECK2-NEXT: store double [[TMP17]], ptr [[V]], align 8
+// CHECK2-NEXT: [[TMP18:%.*]] = load double, ptr [[V]], align 8
+// CHECK2-NEXT: call void (...) @body(double noundef [[TMP18]])
+// CHECK2-NEXT: br label [[FOR_INC:%.*]]
+// CHECK2: for.inc:
+// CHECK2-NEXT: [[TMP19:%.*]] = load i64, ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i64 [[TMP19]], 1
+// CHECK2-NEXT: store i64 [[INC]], ptr [[DOTTILE_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND7]], !llvm.loop [[LOOP25:![0-9]+]]
+// CHECK2: for.end:
+// CHECK2-NEXT: br label [[FOR_INC16:%.*]]
+// CHECK2: for.inc16:
+// CHECK2-NEXT: [[TMP20:%.*]] = load i64, ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: [[ADD17:%.*]] = add nsw i64 [[TMP20]], 5
+// CHECK2-NEXT: store i64 [[ADD17]], ptr [[DOTFLOOR_0_IV___BEGIN2]], align 8
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP26:![0-9]+]]
+// CHECK2: for.end18:
+// CHECK2-NEXT: ret void
+//
+//
+// CHECK2-LABEL: define dso_local void @tfoo7(
+// CHECK2-SAME: ) #[[ATTR1]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: call void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(i32 noundef 0, i32 noundef 42)
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_
-// CHECK2-SAME: (i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR1]] comdat {
+// CHECK2-LABEL: define linkonce_odr void @_Z4foo7IiTnT_Li3ETnS0_Li5EEvS0_S0_(
+// CHECK2-SAME: i32 noundef [[START:%.*]], i32 noundef [[END:%.*]]) #[[ATTR1]] comdat {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4
@@ -2211,7 +2629,7 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
// CHECK2-NEXT: [[ADD7:%.*]] = add i32 [[TMP9]], 1
// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP10]], 5
+// CHECK2-NEXT: [[ADD8:%.*]] = add i32 [[TMP10]], 5
// CHECK2-NEXT: [[CMP9:%.*]] = icmp ult i32 [[ADD7]], [[ADD8]]
// CHECK2-NEXT: br i1 [[CMP9]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
@@ -2220,7 +2638,7 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP12]], 5
+// CHECK2-NEXT: [[ADD11:%.*]] = add i32 [[TMP12]], 5
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
// CHECK2-NEXT: [[COND:%.*]] = phi i32 [ [[ADD10]], [[COND_TRUE]] ], [ [[ADD11]], [[COND_FALSE]] ]
@@ -2237,23 +2655,74 @@ extern "C" void foo8(int a) {
// CHECK2-NEXT: br label [[FOR_INC:%.*]]
// CHECK2: for.inc:
// CHECK2-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP16]], 1
+// CHECK2-NEXT: [[INC:%.*]] = add i32 [[TMP16]], 1
// CHECK2-NEXT: store i32 [[INC]], ptr [[DOTTILE_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP23:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND6]], !llvm.loop [[LOOP27:![0-9]+]]
// CHECK2: for.end:
// CHECK2-NEXT: br label [[FOR_INC15:%.*]]
// CHECK2: for.inc15:
// CHECK2-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP17]], 5
+// CHECK2-NEXT: [[ADD16:%.*]] = add i32 [[TMP17]], 5
// CHECK2-NEXT: store i32 [[ADD16]], ptr [[DOTFLOOR_0_IV_I]], align 4
-// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]]
+// CHECK2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]]
// CHECK2: for.end17:
// CHECK2-NEXT: ret void
//
//
-// CHECK2-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_tile_codegen.cpp
-// CHECK2-SAME: () #[[ATTR0]] section ".text.startup" {
+// CHECK2-LABEL: define internal void @_GLOBAL__sub_I_tile_codegen.cpp(
+// CHECK2-SAME: ) #[[ATTR0]] section ".text.startup" {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: call void @__cxx_global_var_init()
// CHECK2-NEXT: ret void
//
+//.
+// CHECK1: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK1: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK1: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK1: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK1: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK1: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK1: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK1: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+// CHECK1: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]}
+// CHECK1: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]}
+// CHECK1: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]}
+// CHECK1: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]}
+// CHECK1: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]}
+// CHECK1: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]}
+// CHECK1: [[LOOP17]] = distinct !{[[LOOP17]], [[META4]]}
+// CHECK1: [[LOOP18]] = distinct !{[[LOOP18]], [[META4]]}
+// CHECK1: [[LOOP21]] = distinct !{[[LOOP21]], [[META4]]}
+// CHECK1: [[LOOP22]] = distinct !{[[LOOP22]], [[META4]]}
+// CHECK1: [[LOOP23]] = distinct !{[[LOOP23]], [[META4]]}
+// CHECK1: [[LOOP24]] = distinct !{[[LOOP24]], [[META4]]}
+// CHECK1: [[LOOP25]] = distinct !{[[LOOP25]], [[META4]]}
+// CHECK1: [[LOOP26]] = distinct !{[[LOOP26]], [[META4]]}
+// CHECK1: [[LOOP27]] = distinct !{[[LOOP27]], [[META4]]}
+// CHECK1: [[LOOP28]] = distinct !{[[LOOP28]], [[META4]]}
+//.
+// CHECK2: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK2: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK2: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]}
+// CHECK2: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]}
+// CHECK2: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+// CHECK2: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]}
+// CHECK2: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]}
+// CHECK2: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]}
+// CHECK2: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]}
+// CHECK2: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]}
+// CHECK2: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]}
+// CHECK2: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]}
+// CHECK2: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]}
+// CHECK2: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]}
+// CHECK2: [[LOOP17]] = distinct !{[[LOOP17]], [[META4]]}
+// CHECK2: [[LOOP18]] = distinct !{[[LOOP18]], [[META4]]}
+// CHECK2: [[LOOP19]] = distinct !{[[LOOP19]], [[META4]]}
+// CHECK2: [[LOOP20]] = distinct !{[[LOOP20]], [[META4]]}
+// CHECK2: [[LOOP23]] = distinct !{[[LOOP23]], [[META4]]}
+// CHECK2: [[LOOP24]] = distinct !{[[LOOP24]], [[META4]]}
+// CHECK2: [[LOOP25]] = distinct !{[[LOOP25]], [[META4]]}
+// CHECK2: [[LOOP26]] = distinct !{[[LOOP26]], [[META4]]}
+// CHECK2: [[LOOP27]] = distinct !{[[LOOP27]], [[META4]]}
+// CHECK2: [[LOOP28]] = distinct !{[[LOOP28]], [[META4]]}
+//.
diff --git a/clang/test/OpenMP/tile_codegen_for_dependent.cpp b/clang/test/OpenMP/tile_codegen_for_dependent.cpp
index 93c51c9165a4..820d33d15287 100644
--- a/clang/test/OpenMP/tile_codegen_for_dependent.cpp
+++ b/clang/test/OpenMP/tile_codegen_for_dependent.cpp
@@ -17,7 +17,7 @@
extern "C" void body(...) {}
-// IR-LABEL: @func(
+// IR-LABEL: define {{.*}}@func(
// IR-NEXT: [[ENTRY:.*]]:
// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4
// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4
@@ -27,18 +27,18 @@ extern "C" void body(...) {}
// IR-NEXT: %[[I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTNEW_STEP:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_5:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_7:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTFLOOR_0_IV_I12:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTFLOOR_0_IV_I11:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4
-// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(ptr @2)
+// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:.+]])
// IR-NEXT: store i32 %[[START:.+]], ptr %[[START_ADDR]], align 4
// IR-NEXT: store i32 %[[END:.+]], ptr %[[END_ADDR]], align 4
// IR-NEXT: store i32 %[[STEP:.+]], ptr %[[STEP_ADDR]], align 4
@@ -49,44 +49,44 @@ extern "C" void body(...) {}
// IR-NEXT: %[[TMP3:.+]] = load i32, ptr %[[END_ADDR]], align 4
// IR-NEXT: store i32 %[[TMP3]], ptr %[[DOTCAPTURE_EXPR_1]], align 4
// IR-NEXT: %[[TMP4:.+]] = load i32, ptr %[[STEP_ADDR]], align 4
-// IR-NEXT: store i32 %[[TMP4]], ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: store i32 %[[TMP4]], ptr %[[DOTNEW_STEP]], align 4
// IR-NEXT: %[[TMP5:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_1]], align 4
// IR-NEXT: %[[TMP6:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_]], align 4
// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]]
-// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1
-// IR-NEXT: %[[TMP7:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]]
-// IR-NEXT: %[[TMP8:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[SUB3:.+]] = sub i32 %[[SUB]], 1
+// IR-NEXT: %[[TMP7:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
+// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB3]], %[[TMP7]]
+// IR-NEXT: %[[TMP8:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]]
-// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1
-// IR-NEXT: store i32 %[[SUB5]], ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[TMP9:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1
-// IR-NEXT: store i32 %[[ADD7]], ptr %[[DOTCAPTURE_EXPR_6]], align 4
-// IR-NEXT: %[[TMP10:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_6]], align 4
-// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3
-// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4
-// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1
-// IR-NEXT: store i32 %[[SUB11]], ptr %[[DOTCAPTURE_EXPR_8]], align 4
+// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[DIV]], 1
+// IR-NEXT: store i32 %[[SUB4]], ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[TMP9:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD6:.+]] = add i32 %[[TMP9]], 1
+// IR-NEXT: store i32 %[[ADD6]], ptr %[[DOTCAPTURE_EXPR_5]], align 4
+// IR-NEXT: %[[TMP10:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_5]], align 4
+// IR-NEXT: %[[SUB8:.+]] = sub i32 %[[TMP10]], -3
+// IR-NEXT: %[[DIV9:.+]] = udiv i32 %[[SUB8]], 4
+// IR-NEXT: %[[SUB10:.+]] = sub i32 %[[DIV9]], 1
+// IR-NEXT: store i32 %[[SUB10]], ptr %[[DOTCAPTURE_EXPR_7]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTFLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[TMP11:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_6]], align 4
+// IR-NEXT: %[[TMP11:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_5]], align 4
// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP11]]
// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_PRECOND_THEN]]:
// IR-NEXT: store i32 0, ptr %[[DOTOMP_LB]], align 4
-// IR-NEXT: %[[TMP12:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
+// IR-NEXT: %[[TMP12:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
// IR-NEXT: store i32 %[[TMP12]], ptr %[[DOTOMP_UB]], align 4
// IR-NEXT: store i32 1, ptr %[[DOTOMP_STRIDE]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTOMP_IS_LAST]], align 4
-// IR-NEXT: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[TMP0]], i32 34, ptr %[[DOTOMP_IS_LAST]], ptr %[[DOTOMP_LB]], ptr %[[DOTOMP_UB]], ptr %[[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1:.+]], i32 %[[TMP0]], i32 34, ptr %[[DOTOMP_IS_LAST]], ptr %[[DOTOMP_LB]], ptr %[[DOTOMP_UB]], ptr %[[DOTOMP_STRIDE]], i32 1, i32 1)
// IR-NEXT: %[[TMP13:.+]] = load i32, ptr %[[DOTOMP_UB]], align 4
-// IR-NEXT: %[[TMP14:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[CMP13:.+]] = icmp ugt i32 %[[TMP13]], %[[TMP14]]
-// IR-NEXT: br i1 %[[CMP13]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// IR-NEXT: %[[TMP14:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[CMP12:.+]] = icmp ugt i32 %[[TMP13]], %[[TMP14]]
+// IR-NEXT: br i1 %[[CMP12]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
// IR-EMPTY:
// IR-NEXT: [[COND_TRUE]]:
-// IR-NEXT: %[[TMP15:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
+// IR-NEXT: %[[TMP15:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
// IR-NEXT: br label %[[COND_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[COND_FALSE]]:
@@ -103,50 +103,50 @@ extern "C" void body(...) {}
// IR-NEXT: [[OMP_INNER_FOR_COND]]:
// IR-NEXT: %[[TMP18:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: %[[TMP19:.+]] = load i32, ptr %[[DOTOMP_UB]], align 4
-// IR-NEXT: %[[ADD14:.+]] = add i32 %[[TMP19]], 1
-// IR-NEXT: %[[CMP15:.+]] = icmp ult i32 %[[TMP18]], %[[ADD14]]
-// IR-NEXT: br i1 %[[CMP15]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
+// IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP19]], 1
+// IR-NEXT: %[[CMP14:.+]] = icmp ult i32 %[[TMP18]], %[[ADD13]]
+// IR-NEXT: br i1 %[[CMP14]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_BODY]]:
// IR-NEXT: %[[TMP20:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP20]], 4
-// IR-NEXT: %[[ADD16:.+]] = add i32 0, %[[MUL]]
-// IR-NEXT: store i32 %[[ADD16]], ptr %[[DOTFLOOR_0_IV_I12]], align 4
-// IR-NEXT: %[[TMP21:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I12]], align 4
+// IR-NEXT: %[[ADD15:.+]] = add i32 0, %[[MUL]]
+// IR-NEXT: store i32 %[[ADD15]], ptr %[[DOTFLOOR_0_IV_I11]], align 4
+// IR-NEXT: %[[TMP21:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I11]], align 4
// IR-NEXT: store i32 %[[TMP21]], ptr %[[DOTTILE_0_IV_I]], align 4
// IR-NEXT: br label %[[FOR_COND:.+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_COND]]:
// IR-NEXT: %[[TMP22:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[TMP23:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP23]], 1
-// IR-NEXT: %[[TMP24:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I12]], align 4
-// IR-NEXT: %[[ADD18:.+]] = add nsw i32 %[[TMP24]], 4
-// IR-NEXT: %[[CMP19:.+]] = icmp ult i32 %[[ADD17]], %[[ADD18]]
-// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE20:.+]], label %[[COND_FALSE22:.+]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_TRUE20]]:
-// IR-NEXT: %[[TMP25:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD21:.+]] = add i32 %[[TMP25]], 1
-// IR-NEXT: br label %[[COND_END24:.+]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_FALSE22]]:
-// IR-NEXT: %[[TMP26:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I12]], align 4
-// IR-NEXT: %[[ADD23:.+]] = add nsw i32 %[[TMP26]], 4
-// IR-NEXT: br label %[[COND_END24]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_END24]]:
-// IR-NEXT: %[[COND25:.+]] = phi i32 [ %[[ADD21]], %[[COND_TRUE20]] ], [ %[[ADD23]], %[[COND_FALSE22]] ]
-// IR-NEXT: %[[CMP26:.+]] = icmp ult i32 %[[TMP22]], %[[COND25]]
-// IR-NEXT: br i1 %[[CMP26]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]]
+// IR-NEXT: %[[TMP23:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD16:.+]] = add i32 %[[TMP23]], 1
+// IR-NEXT: %[[TMP24:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I11]], align 4
+// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP24]], 4
+// IR-NEXT: %[[CMP18:.+]] = icmp ult i32 %[[ADD16]], %[[ADD17]]
+// IR-NEXT: br i1 %[[CMP18]], label %[[COND_TRUE19:.+]], label %[[COND_FALSE21:.+]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_TRUE19]]:
+// IR-NEXT: %[[TMP25:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD20:.+]] = add i32 %[[TMP25]], 1
+// IR-NEXT: br label %[[COND_END23:.+]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_FALSE21]]:
+// IR-NEXT: %[[TMP26:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I11]], align 4
+// IR-NEXT: %[[ADD22:.+]] = add i32 %[[TMP26]], 4
+// IR-NEXT: br label %[[COND_END23]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_END23]]:
+// IR-NEXT: %[[COND24:.+]] = phi i32 [ %[[ADD20]], %[[COND_TRUE19]] ], [ %[[ADD22]], %[[COND_FALSE21]] ]
+// IR-NEXT: %[[CMP25:.+]] = icmp ult i32 %[[TMP22]], %[[COND24]]
+// IR-NEXT: br i1 %[[CMP25]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_BODY]]:
// IR-NEXT: %[[TMP27:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_]], align 4
// IR-NEXT: %[[TMP28:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[TMP29:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT: %[[MUL27:.+]] = mul i32 %[[TMP28]], %[[TMP29]]
-// IR-NEXT: %[[ADD28:.+]] = add i32 %[[TMP27]], %[[MUL27]]
-// IR-NEXT: store i32 %[[ADD28]], ptr %[[I]], align 4
+// IR-NEXT: %[[TMP29:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
+// IR-NEXT: %[[MUL26:.+]] = mul i32 %[[TMP28]], %[[TMP29]]
+// IR-NEXT: %[[ADD27:.+]] = add i32 %[[TMP27]], %[[MUL26]]
+// IR-NEXT: store i32 %[[ADD27]], ptr %[[I]], align 4
// IR-NEXT: %[[TMP30:.+]] = load i32, ptr %[[START_ADDR]], align 4
// IR-NEXT: %[[TMP31:.+]] = load i32, ptr %[[END_ADDR]], align 4
// IR-NEXT: %[[TMP32:.+]] = load i32, ptr %[[STEP_ADDR]], align 4
@@ -156,9 +156,9 @@ extern "C" void body(...) {}
// IR-EMPTY:
// IR-NEXT: [[FOR_INC]]:
// IR-NEXT: %[[TMP34:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP34]], 1
+// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP34]], 1
// IR-NEXT: store i32 %[[INC]], ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]]
+// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP3:[0-9]+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_END]]:
// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]]
@@ -168,19 +168,19 @@ extern "C" void body(...) {}
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_INC]]:
// IR-NEXT: %[[TMP35:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
-// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP35]], 1
-// IR-NEXT: store i32 %[[ADD29]], ptr %[[DOTOMP_IV]], align 4
+// IR-NEXT: %[[ADD28:.+]] = add i32 %[[TMP35]], 1
+// IR-NEXT: store i32 %[[ADD28]], ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: br label %[[OMP_INNER_FOR_COND]]
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_END]]:
// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_LOOP_EXIT]]:
-// IR-NEXT: call void @__kmpc_for_static_fini(ptr @1, i32 %[[TMP0]])
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 %[[TMP0]])
// IR-NEXT: br label %[[OMP_PRECOND_END]]
// IR-EMPTY:
// IR-NEXT: [[OMP_PRECOND_END]]:
-// IR-NEXT: call void @__kmpc_barrier(ptr @3, i32 %[[TMP0]])
+// IR-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3:.+]], i32 %[[TMP0]])
// IR-NEXT: ret void
// IR-NEXT: }
extern "C" void func(int start, int end, int step) {
diff --git a/clang/test/OpenMP/tile_codegen_tile_for.cpp b/clang/test/OpenMP/tile_codegen_tile_for.cpp
index d0fb89398c24..91536c406368 100644
--- a/clang/test/OpenMP/tile_codegen_tile_for.cpp
+++ b/clang/test/OpenMP/tile_codegen_tile_for.cpp
@@ -16,7 +16,7 @@
extern "C" void body(...) {}
-// IR-LABEL: @func(
+// IR-LABEL: define {{.*}}@func(
// IR-NEXT: [[ENTRY:.*]]:
// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4
// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4
@@ -26,22 +26,22 @@ extern "C" void body(...) {}
// IR-NEXT: %[[I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTNEW_STEP:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTCAPTURE_EXPR_14:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_5:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_7:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_11:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTCAPTURE_EXPR_13:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4
-// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18:.+]] = alloca i32, align 4
+// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTTILE_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4
// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4
-// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(ptr @2)
+// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:.+]])
// IR-NEXT: store i32 %[[START:.+]], ptr %[[START_ADDR]], align 4
// IR-NEXT: store i32 %[[END:.+]], ptr %[[END_ADDR]], align 4
// IR-NEXT: store i32 %[[STEP:.+]], ptr %[[STEP_ADDR]], align 4
@@ -52,53 +52,53 @@ extern "C" void body(...) {}
// IR-NEXT: %[[TMP3:.+]] = load i32, ptr %[[END_ADDR]], align 4
// IR-NEXT: store i32 %[[TMP3]], ptr %[[DOTCAPTURE_EXPR_1]], align 4
// IR-NEXT: %[[TMP4:.+]] = load i32, ptr %[[STEP_ADDR]], align 4
-// IR-NEXT: store i32 %[[TMP4]], ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: store i32 %[[TMP4]], ptr %[[DOTNEW_STEP]], align 4
// IR-NEXT: %[[TMP5:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_1]], align 4
// IR-NEXT: %[[TMP6:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_]], align 4
// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]]
-// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1
-// IR-NEXT: %[[TMP7:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]]
-// IR-NEXT: %[[TMP8:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[SUB3:.+]] = sub i32 %[[SUB]], 1
+// IR-NEXT: %[[TMP7:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
+// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB3]], %[[TMP7]]
+// IR-NEXT: %[[TMP8:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]]
-// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1
-// IR-NEXT: store i32 %[[SUB5]], ptr %[[DOTCAPTURE_EXPR_3]], align 4
+// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[DIV]], 1
+// IR-NEXT: store i32 %[[SUB4]], ptr %[[DOTCAPTURE_EXPR_2]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTFLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[TMP9:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1
-// IR-NEXT: store i32 %[[ADD7]], ptr %[[DOTCAPTURE_EXPR_6]], align 4
-// IR-NEXT: %[[TMP10:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_6]], align 4
-// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3
-// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4
-// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1
-// IR-NEXT: store i32 %[[SUB11]], ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[TMP11:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP11]], 1
-// IR-NEXT: store i32 %[[ADD13]], ptr %[[DOTCAPTURE_EXPR_12]], align 4
-// IR-NEXT: %[[TMP12:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_12]], align 4
-// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -2
-// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 3
-// IR-NEXT: %[[SUB17:.+]] = sub i32 %[[DIV16]], 1
-// IR-NEXT: store i32 %[[SUB17]], ptr %[[DOTCAPTURE_EXPR_14]], align 4
+// IR-NEXT: %[[TMP9:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD6:.+]] = add i32 %[[TMP9]], 1
+// IR-NEXT: store i32 %[[ADD6]], ptr %[[DOTCAPTURE_EXPR_5]], align 4
+// IR-NEXT: %[[TMP10:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_5]], align 4
+// IR-NEXT: %[[SUB8:.+]] = sub i32 %[[TMP10]], -3
+// IR-NEXT: %[[DIV9:.+]] = udiv i32 %[[SUB8]], 4
+// IR-NEXT: %[[SUB10:.+]] = sub i32 %[[DIV9]], 1
+// IR-NEXT: store i32 %[[SUB10]], ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[TMP11:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[ADD12:.+]] = add i32 %[[TMP11]], 1
+// IR-NEXT: store i32 %[[ADD12]], ptr %[[DOTCAPTURE_EXPR_11]], align 4
+// IR-NEXT: %[[TMP12:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_11]], align 4
+// IR-NEXT: %[[SUB14:.+]] = sub i32 %[[TMP12]], -2
+// IR-NEXT: %[[DIV15:.+]] = udiv i32 %[[SUB14]], 3
+// IR-NEXT: %[[SUB16:.+]] = sub i32 %[[DIV15]], 1
+// IR-NEXT: store i32 %[[SUB16]], ptr %[[DOTCAPTURE_EXPR_13]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[TMP13:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_12]], align 4
+// IR-NEXT: %[[TMP13:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_11]], align 4
// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]]
// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_PRECOND_THEN]]:
// IR-NEXT: store i32 0, ptr %[[DOTOMP_LB]], align 4
-// IR-NEXT: %[[TMP14:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_14]], align 4
+// IR-NEXT: %[[TMP14:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_13]], align 4
// IR-NEXT: store i32 %[[TMP14]], ptr %[[DOTOMP_UB]], align 4
// IR-NEXT: store i32 1, ptr %[[DOTOMP_STRIDE]], align 4
// IR-NEXT: store i32 0, ptr %[[DOTOMP_IS_LAST]], align 4
-// IR-NEXT: call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[TMP0]], i32 34, ptr %[[DOTOMP_IS_LAST]], ptr %[[DOTOMP_LB]], ptr %[[DOTOMP_UB]], ptr %[[DOTOMP_STRIDE]], i32 1, i32 1)
+// IR-NEXT: call void @__kmpc_for_static_init_4u(ptr @[[GLOB1:.+]], i32 %[[TMP0]], i32 34, ptr %[[DOTOMP_IS_LAST]], ptr %[[DOTOMP_LB]], ptr %[[DOTOMP_UB]], ptr %[[DOTOMP_STRIDE]], i32 1, i32 1)
// IR-NEXT: %[[TMP15:.+]] = load i32, ptr %[[DOTOMP_UB]], align 4
-// IR-NEXT: %[[TMP16:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_14]], align 4
-// IR-NEXT: %[[CMP19:.+]] = icmp ugt i32 %[[TMP15]], %[[TMP16]]
-// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
+// IR-NEXT: %[[TMP16:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_13]], align 4
+// IR-NEXT: %[[CMP18:.+]] = icmp ugt i32 %[[TMP15]], %[[TMP16]]
+// IR-NEXT: br i1 %[[CMP18]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
// IR-EMPTY:
// IR-NEXT: [[COND_TRUE]]:
-// IR-NEXT: %[[TMP17:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_14]], align 4
+// IR-NEXT: %[[TMP17:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_13]], align 4
// IR-NEXT: br label %[[COND_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[COND_FALSE]]:
@@ -115,83 +115,83 @@ extern "C" void body(...) {}
// IR-NEXT: [[OMP_INNER_FOR_COND]]:
// IR-NEXT: %[[TMP20:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: %[[TMP21:.+]] = load i32, ptr %[[DOTOMP_UB]], align 4
-// IR-NEXT: %[[ADD20:.+]] = add i32 %[[TMP21]], 1
-// IR-NEXT: %[[CMP21:.+]] = icmp ult i32 %[[TMP20]], %[[ADD20]]
-// IR-NEXT: br i1 %[[CMP21]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
+// IR-NEXT: %[[ADD19:.+]] = add i32 %[[TMP21]], 1
+// IR-NEXT: %[[CMP20:.+]] = icmp ult i32 %[[TMP20]], %[[ADD19]]
+// IR-NEXT: br i1 %[[CMP20]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_BODY]]:
// IR-NEXT: %[[TMP22:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 3
-// IR-NEXT: %[[ADD22:.+]] = add i32 0, %[[MUL]]
-// IR-NEXT: store i32 %[[ADD22]], ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4
-// IR-NEXT: %[[TMP23:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4
+// IR-NEXT: %[[ADD21:.+]] = add i32 0, %[[MUL]]
+// IR-NEXT: store i32 %[[ADD21]], ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17]], align 4
+// IR-NEXT: %[[TMP23:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17]], align 4
// IR-NEXT: store i32 %[[TMP23]], ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
// IR-NEXT: br label %[[FOR_COND:.+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_COND]]:
// IR-NEXT: %[[TMP24:.+]] = load i32, ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[TMP25:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP25]], 1
-// IR-NEXT: %[[TMP26:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4
-// IR-NEXT: %[[ADD24:.+]] = add i32 %[[TMP26]], 3
-// IR-NEXT: %[[CMP25:.+]] = icmp ult i32 %[[ADD23]], %[[ADD24]]
-// IR-NEXT: br i1 %[[CMP25]], label %[[COND_TRUE26:.+]], label %[[COND_FALSE28:.+]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_TRUE26]]:
-// IR-NEXT: %[[TMP27:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_8]], align 4
-// IR-NEXT: %[[ADD27:.+]] = add i32 %[[TMP27]], 1
-// IR-NEXT: br label %[[COND_END30:.+]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_FALSE28]]:
-// IR-NEXT: %[[TMP28:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4
-// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP28]], 3
-// IR-NEXT: br label %[[COND_END30]]
-// IR-EMPTY:
-// IR-NEXT: [[COND_END30]]:
-// IR-NEXT: %[[COND31:.+]] = phi i32 [ %[[ADD27]], %[[COND_TRUE26]] ], [ %[[ADD29]], %[[COND_FALSE28]] ]
-// IR-NEXT: %[[CMP32:.+]] = icmp ult i32 %[[TMP24]], %[[COND31]]
-// IR-NEXT: br i1 %[[CMP32]], label %[[FOR_BODY:.+]], label %[[FOR_END51:.+]]
+// IR-NEXT: %[[TMP25:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[ADD22:.+]] = add i32 %[[TMP25]], 1
+// IR-NEXT: %[[TMP26:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17]], align 4
+// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP26]], 3
+// IR-NEXT: %[[CMP24:.+]] = icmp ult i32 %[[ADD22]], %[[ADD23]]
+// IR-NEXT: br i1 %[[CMP24]], label %[[COND_TRUE25:.+]], label %[[COND_FALSE27:.+]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_TRUE25]]:
+// IR-NEXT: %[[TMP27:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_7]], align 4
+// IR-NEXT: %[[ADD26:.+]] = add i32 %[[TMP27]], 1
+// IR-NEXT: br label %[[COND_END29:.+]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_FALSE27]]:
+// IR-NEXT: %[[TMP28:.+]] = load i32, ptr %[[DOTFLOOR_0_IV__FLOOR_0_IV_I17]], align 4
+// IR-NEXT: %[[ADD28:.+]] = add i32 %[[TMP28]], 3
+// IR-NEXT: br label %[[COND_END29]]
+// IR-EMPTY:
+// IR-NEXT: [[COND_END29]]:
+// IR-NEXT: %[[COND30:.+]] = phi i32 [ %[[ADD26]], %[[COND_TRUE25]] ], [ %[[ADD28]], %[[COND_FALSE27]] ]
+// IR-NEXT: %[[CMP31:.+]] = icmp ult i32 %[[TMP24]], %[[COND30]]
+// IR-NEXT: br i1 %[[CMP31]], label %[[FOR_BODY:.+]], label %[[FOR_END50:.+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_BODY]]:
// IR-NEXT: %[[TMP29:.+]] = load i32, ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[MUL33:.+]] = mul i32 %[[TMP29]], 4
-// IR-NEXT: %[[ADD34:.+]] = add i32 0, %[[MUL33]]
-// IR-NEXT: store i32 %[[ADD34]], ptr %[[DOTFLOOR_0_IV_I]], align 4
+// IR-NEXT: %[[MUL32:.+]] = mul i32 %[[TMP29]], 4
+// IR-NEXT: %[[ADD33:.+]] = add i32 0, %[[MUL32]]
+// IR-NEXT: store i32 %[[ADD33]], ptr %[[DOTFLOOR_0_IV_I]], align 4
// IR-NEXT: %[[TMP30:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I]], align 4
// IR-NEXT: store i32 %[[TMP30]], ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: br label %[[FOR_COND35:.+]]
+// IR-NEXT: br label %[[FOR_COND34:.+]]
// IR-EMPTY:
-// IR-NEXT: [[FOR_COND35]]:
+// IR-NEXT: [[FOR_COND34]]:
// IR-NEXT: %[[TMP31:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[TMP32:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP32]], 1
+// IR-NEXT: %[[TMP32:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD35:.+]] = add i32 %[[TMP32]], 1
// IR-NEXT: %[[TMP33:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[ADD37:.+]] = add nsw i32 %[[TMP33]], 4
-// IR-NEXT: %[[CMP38:.+]] = icmp ult i32 %[[ADD36]], %[[ADD37]]
-// IR-NEXT: br i1 %[[CMP38]], label %[[COND_TRUE39:.+]], label %[[COND_FALSE41:.+]]
+// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP33]], 4
+// IR-NEXT: %[[CMP37:.+]] = icmp ult i32 %[[ADD35]], %[[ADD36]]
+// IR-NEXT: br i1 %[[CMP37]], label %[[COND_TRUE38:.+]], label %[[COND_FALSE40:.+]]
// IR-EMPTY:
-// IR-NEXT: [[COND_TRUE39]]:
-// IR-NEXT: %[[TMP34:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_3]], align 4
-// IR-NEXT: %[[ADD40:.+]] = add i32 %[[TMP34]], 1
-// IR-NEXT: br label %[[COND_END43:.+]]
+// IR-NEXT: [[COND_TRUE38]]:
+// IR-NEXT: %[[TMP34:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
+// IR-NEXT: %[[ADD39:.+]] = add i32 %[[TMP34]], 1
+// IR-NEXT: br label %[[COND_END42:.+]]
// IR-EMPTY:
-// IR-NEXT: [[COND_FALSE41]]:
+// IR-NEXT: [[COND_FALSE40]]:
// IR-NEXT: %[[TMP35:.+]] = load i32, ptr %[[DOTFLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[ADD42:.+]] = add nsw i32 %[[TMP35]], 4
-// IR-NEXT: br label %[[COND_END43]]
+// IR-NEXT: %[[ADD41:.+]] = add i32 %[[TMP35]], 4
+// IR-NEXT: br label %[[COND_END42]]
// IR-EMPTY:
-// IR-NEXT: [[COND_END43]]:
-// IR-NEXT: %[[COND44:.+]] = phi i32 [ %[[ADD40]], %[[COND_TRUE39]] ], [ %[[ADD42]], %[[COND_FALSE41]] ]
-// IR-NEXT: %[[CMP45:.+]] = icmp ult i32 %[[TMP31]], %[[COND44]]
-// IR-NEXT: br i1 %[[CMP45]], label %[[FOR_BODY46:.+]], label %[[FOR_END:.+]]
+// IR-NEXT: [[COND_END42]]:
+// IR-NEXT: %[[COND43:.+]] = phi i32 [ %[[ADD39]], %[[COND_TRUE38]] ], [ %[[ADD41]], %[[COND_FALSE40]] ]
+// IR-NEXT: %[[CMP44:.+]] = icmp ult i32 %[[TMP31]], %[[COND43]]
+// IR-NEXT: br i1 %[[CMP44]], label %[[FOR_BODY45:.+]], label %[[FOR_END:.+]]
// IR-EMPTY:
-// IR-NEXT: [[FOR_BODY46]]:
+// IR-NEXT: [[FOR_BODY45]]:
// IR-NEXT: %[[TMP36:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_]], align 4
// IR-NEXT: %[[TMP37:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[TMP38:.+]] = load i32, ptr %[[DOTCAPTURE_EXPR_2]], align 4
-// IR-NEXT: %[[MUL47:.+]] = mul i32 %[[TMP37]], %[[TMP38]]
-// IR-NEXT: %[[ADD48:.+]] = add i32 %[[TMP36]], %[[MUL47]]
-// IR-NEXT: store i32 %[[ADD48]], ptr %[[I]], align 4
+// IR-NEXT: %[[TMP38:.+]] = load i32, ptr %[[DOTNEW_STEP]], align 4
+// IR-NEXT: %[[MUL46:.+]] = mul i32 %[[TMP37]], %[[TMP38]]
+// IR-NEXT: %[[ADD47:.+]] = add i32 %[[TMP36]], %[[MUL46]]
+// IR-NEXT: store i32 %[[ADD47]], ptr %[[I]], align 4
// IR-NEXT: %[[TMP39:.+]] = load i32, ptr %[[START_ADDR]], align 4
// IR-NEXT: %[[TMP40:.+]] = load i32, ptr %[[END_ADDR]], align 4
// IR-NEXT: %[[TMP41:.+]] = load i32, ptr %[[STEP_ADDR]], align 4
@@ -201,20 +201,20 @@ extern "C" void body(...) {}
// IR-EMPTY:
// IR-NEXT: [[FOR_INC]]:
// IR-NEXT: %[[TMP43:.+]] = load i32, ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP43]], 1
+// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP43]], 1
// IR-NEXT: store i32 %[[INC]], ptr %[[DOTTILE_0_IV_I]], align 4
-// IR-NEXT: br label %[[FOR_COND35]], !llvm.loop ![[LOOP2:[0-9]+]]
+// IR-NEXT: br label %[[FOR_COND34]], !llvm.loop ![[LOOP3:[0-9]+]]
// IR-EMPTY:
// IR-NEXT: [[FOR_END]]:
-// IR-NEXT: br label %[[FOR_INC49:.+]]
+// IR-NEXT: br label %[[FOR_INC48:.+]]
// IR-EMPTY:
-// IR-NEXT: [[FOR_INC49]]:
+// IR-NEXT: [[FOR_INC48]]:
// IR-NEXT: %[[TMP44:.+]] = load i32, ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: %[[INC50:.+]] = add i32 %[[TMP44]], 1
-// IR-NEXT: store i32 %[[INC50]], ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
-// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP4:[0-9]+]]
+// IR-NEXT: %[[INC49:.+]] = add i32 %[[TMP44]], 1
+// IR-NEXT: store i32 %[[INC49]], ptr %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4
+// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP5:[0-9]+]]
// IR-EMPTY:
-// IR-NEXT: [[FOR_END51]]:
+// IR-NEXT: [[FOR_END50]]:
// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_BODY_CONTINUE]]:
@@ -222,21 +222,23 @@ extern "C" void body(...) {}
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_INC]]:
// IR-NEXT: %[[TMP45:.+]] = load i32, ptr %[[DOTOMP_IV]], align 4
-// IR-NEXT: %[[ADD52:.+]] = add i32 %[[TMP45]], 1
-// IR-NEXT: store i32 %[[ADD52]], ptr %[[DOTOMP_IV]], align 4
+// IR-NEXT: %[[ADD51:.+]] = add i32 %[[TMP45]], 1
+// IR-NEXT: store i32 %[[ADD51]], ptr %[[DOTOMP_IV]], align 4
// IR-NEXT: br label %[[OMP_INNER_FOR_COND]]
// IR-EMPTY:
// IR-NEXT: [[OMP_INNER_FOR_END]]:
// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]]
// IR-EMPTY:
// IR-NEXT: [[OMP_LOOP_EXIT]]:
-// IR-NEXT: call void @__kmpc_for_static_fini(ptr @1, i32 %[[TMP0]])
+// IR-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 %[[TMP0]])
// IR-NEXT: br label %[[OMP_PRECOND_END]]
// IR-EMPTY:
// IR-NEXT: [[OMP_PRECOND_END]]:
-// IR-NEXT: call void @__kmpc_barrier(ptr @3, i32 %[[TMP0]])
+// IR-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3:.+]], i32 %[[TMP0]])
// IR-NEXT: ret void
// IR-NEXT: }
+
+
extern "C" void func(int start, int end, int step) {
#pragma omp for
#pragma omp tile sizes(3)
@@ -246,8 +248,10 @@ extern "C" void func(int start, int end, int step) {
}
#endif /* HEADER */
+
// IR: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-// IR: ![[META1:[0-9]+]] = !{!"{{[^"]*}}"}
-// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]]}
-// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"}
-// IR: ![[LOOP4]] = distinct !{![[LOOP4]], ![[LOOPPROP3]]}
+// IR: ![[META1:[0-9]+]] = !{i32 7, !"openmp", i32 51}
+// IR: ![[META2:[0-9]+]] =
+// IR: ![[LOOP3]] = distinct !{![[LOOP3]], ![[LOOPPROP4:[0-9]+]]}
+// IR: ![[LOOPPROP4]] = !{!"llvm.loop.mustprogress"}
+// IR: ![[LOOP5]] = distinct !{![[LOOP5]], ![[LOOPPROP4]]}
diff --git a/clang/test/PCH/cxx1z-aligned-alloc.cpp b/clang/test/PCH/cxx1z-aligned-alloc.cpp
index c1becbde3bf2..cccd62859784 100644
--- a/clang/test/PCH/cxx1z-aligned-alloc.cpp
+++ b/clang/test/PCH/cxx1z-aligned-alloc.cpp
@@ -1,12 +1,12 @@
// No PCH:
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -include %s -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1z -include %s -verify %s
//
// With PCH:
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -emit-pch %s -o %t
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -include-pch %t -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch %s -o %t
+// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -verify %s
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -emit-pch -fpch-instantiate-templates %s -o %t
-// RUN: %clang_cc1 -pedantic -fsized-deallocation -std=c++1z -include-pch %t -verify %s
+// RUN: %clang_cc1 -pedantic -std=c++1z -emit-pch -fpch-instantiate-templates %s -o %t
+// RUN: %clang_cc1 -pedantic -std=c++1z -include-pch %t -verify %s
// expected-no-diagnostics
diff --git a/clang/test/Parser/altivec.c b/clang/test/Parser/altivec.c
index 445369f0dc06..9291b9b69160 100644
--- a/clang/test/Parser/altivec.c
+++ b/clang/test/Parser/altivec.c
@@ -56,40 +56,40 @@ void f_a2(int b, vector int a);
vector int v = (vector int)(-1);
// These should have errors on AIX and warnings otherwise.
-__vector long vv_l; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector long vv_l; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector signed long vv_sl; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector signed long vv_sl; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector unsigned long vv_ul; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector unsigned long vv_ul; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector long int vv_li; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector long int vv_li; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector signed long int vv_sli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector signed long int vv_sli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector unsigned long int vv_uli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector unsigned long int vv_uli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector long v_l; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector long v_l; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector signed long v_sl; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector signed long v_sl; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector unsigned long v_ul; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector unsigned long v_ul; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector long int v_li; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector long int v_li; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector signed long int v_sli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector signed long int v_sli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector unsigned long int v_uli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector unsigned long int v_uli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
diff --git a/clang/test/Parser/cxx-altivec.cpp b/clang/test/Parser/cxx-altivec.cpp
index 5cb760dababb..15a6bf6d1be8 100644
--- a/clang/test/Parser/cxx-altivec.cpp
+++ b/clang/test/Parser/cxx-altivec.cpp
@@ -59,40 +59,40 @@ void f_a2(int b, vector int a);
vector int v = (vector int)(-1);
// These should have errors on AIX and warnings otherwise.
-__vector long vv_l; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector long vv_l; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector signed long vv_sl; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector signed long vv_sl; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector unsigned long vv_ul; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector unsigned long vv_ul; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector long int vv_li; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector long int vv_li; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector signed long int vv_sli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector signed long int vv_sli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-__vector unsigned long int vv_uli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+__vector unsigned long int vv_uli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector long v_l; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector long v_l; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector signed long v_sl; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector signed long v_sl; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector unsigned long v_ul; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector unsigned long v_ul; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector long int v_li; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector long int v_li; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector signed long int v_sli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector signed long int v_sli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
-vector unsigned long int v_uli; // nonaix-warning {{Use of 'long' with '__vector' is deprecated}}
+vector unsigned long int v_uli; // nonaix-warning {{use of 'long' with '__vector' is deprecated}}
// aix-error@-1 {{cannot use 'long' with '__vector'}}
// novsx-error@-2 {{cannot use 'long' with '__vector'}}
diff --git a/clang/test/Parser/lax-conv.cpp b/clang/test/Parser/lax-conv.cpp
index f784e3fa74e7..0cb2503a9691 100644
--- a/clang/test/Parser/lax-conv.cpp
+++ b/clang/test/Parser/lax-conv.cpp
@@ -21,10 +21,10 @@ template <typename VEC> VEC __attribute__((noinline)) test(vector unsigned char
return (VEC)(a * b);
}
vector unsigned int test1(vector unsigned char RetImplicitConv) {
- return RetImplicitConv; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return RetImplicitConv; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
vector unsigned int test2(vector unsigned char RetImplicitConvAddConst) {
- return RetImplicitConvAddConst + 5; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return RetImplicitConvAddConst + 5; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
vector unsigned int test3(vector unsigned char RetExplicitConv) {
return (vector unsigned int)RetExplicitConv;
@@ -34,7 +34,7 @@ vector unsigned int test4(vector unsigned char RetExplicitConvAddConst) {
}
vector unsigned int test5(vector unsigned char RetImplicitConvAddSame1,
vector unsigned char RetImplicitConvAddSame2) {
- return RetImplicitConvAddSame1 + RetImplicitConvAddSame2; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return RetImplicitConvAddSame1 + RetImplicitConvAddSame2; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
vector unsigned int test6(vector unsigned char RetExplicitConvAddSame1,
vector unsigned char RetExplicitConvAddSame2) {
@@ -54,10 +54,10 @@ vector unsigned long long test9(vector unsigned char a, vector unsigned char b)
return test<vector unsigned long long>(a, b);
}
void test1a(vector unsigned char ArgImplicitConv) {
- return dummy(ArgImplicitConv); // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return dummy(ArgImplicitConv); // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
void test2a(vector unsigned char ArgImplicitConvAddConst) {
- return dummy(ArgImplicitConvAddConst + 5); // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return dummy(ArgImplicitConvAddConst + 5); // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
void test3a(vector unsigned char ArgExplicitConv) {
return dummy((vector unsigned int)ArgExplicitConv);
@@ -67,7 +67,7 @@ void test4a(vector unsigned char ArgExplicitConvAddConst) {
}
void test5a(vector unsigned char ArgImplicitConvAddSame1,
vector unsigned char ArgImplicitConvAddSame2) {
- return dummy(ArgImplicitConvAddSame1 + ArgImplicitConvAddSame2); // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ return dummy(ArgImplicitConvAddSame1 + ArgImplicitConvAddSame2); // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
void test6a(vector unsigned char ArgExplicitConvAddSame1,
vector unsigned char ArgExplicitConvAddSame2) {
@@ -80,33 +80,33 @@ void test7a(vector unsigned char ArgExplicitConvAddSame1Full,
ArgExplicitConvAddSame2Full));
}
void test_bool_compat(void) {
- vbs = vss; // expected-warning {{Implicit conversion between vector types (''__vector short' (vector of 8 'short' values)' and ''__vector __bool unsigned short' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vbs = vus; // expected-warning {{Implicit conversion between vector types (''__vector unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __bool unsigned short' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vbs = vss; // expected-warning {{implicit conversion between vector types (''__vector short' (vector of 8 'short' values)' and ''__vector __bool unsigned short' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vbs = vus; // expected-warning {{implicit conversion between vector types (''__vector unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __bool unsigned short' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vbi = vsi; // expected-warning {{Implicit conversion between vector types (''__vector int' (vector of 4 'int' values)' and ''__vector __bool unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vbi = vui; // expected-warning {{Implicit conversion between vector types (''__vector unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __bool unsigned int' (vector of 4 'unsigned int' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vbi = vsi; // expected-warning {{implicit conversion between vector types (''__vector int' (vector of 4 'int' values)' and ''__vector __bool unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vbi = vui; // expected-warning {{implicit conversion between vector types (''__vector unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __bool unsigned int' (vector of 4 'unsigned int' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vbl = vsl; // expected-warning {{Implicit conversion between vector types (''__vector long long' (vector of 2 'long long' values)' and ''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vbl = vul; // expected-warning {{Implicit conversion between vector types (''__vector unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vbl = vsl; // expected-warning {{implicit conversion between vector types (''__vector long long' (vector of 2 'long long' values)' and ''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vbl = vul; // expected-warning {{implicit conversion between vector types (''__vector unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vbc = vsc; // expected-warning {{Implicit conversion between vector types (''__vector signed char' (vector of 16 'signed char' values)' and ''__vector __bool unsigned char' (vector of 16 'unsigned char' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vbc = vuc; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __bool unsigned char' (vector of 16 'unsigned char' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vbc = vsc; // expected-warning {{implicit conversion between vector types (''__vector signed char' (vector of 16 'signed char' values)' and ''__vector __bool unsigned char' (vector of 16 'unsigned char' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vbc = vuc; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __bool unsigned char' (vector of 16 'unsigned char' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
void test_pixel_compat(void) {
- vp = vbs; // expected-warning {{Implicit conversion between vector types (''__vector __bool unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vss; // expected-warning {{Implicit conversion between vector types (''__vector short' (vector of 8 'short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vus; // expected-warning {{Implicit conversion between vector types (''__vector unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vp = vbs; // expected-warning {{implicit conversion between vector types (''__vector __bool unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vss; // expected-warning {{implicit conversion between vector types (''__vector short' (vector of 8 'short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vus; // expected-warning {{implicit conversion between vector types (''__vector unsigned short' (vector of 8 'unsigned short' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vp = vbi; // expected-warning {{Implicit conversion between vector types (''__vector __bool unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vsi; // expected-warning {{Implicit conversion between vector types (''__vector int' (vector of 4 'int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vui; // expected-warning {{Implicit conversion between vector types (''__vector unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vp = vbi; // expected-warning {{implicit conversion between vector types (''__vector __bool unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vsi; // expected-warning {{implicit conversion between vector types (''__vector int' (vector of 4 'int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vui; // expected-warning {{implicit conversion between vector types (''__vector unsigned int' (vector of 4 'unsigned int' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vp = vbl; // expected-warning {{Implicit conversion between vector types (''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vsl; // expected-warning {{Implicit conversion between vector types (''__vector long long' (vector of 2 'long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vul; // expected-warning {{Implicit conversion between vector types (''__vector unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vp = vbl; // expected-warning {{implicit conversion between vector types (''__vector __bool unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vsl; // expected-warning {{implicit conversion between vector types (''__vector long long' (vector of 2 'long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vul; // expected-warning {{implicit conversion between vector types (''__vector unsigned long long' (vector of 2 'unsigned long long' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
- vp = vbc; // expected-warning {{Implicit conversion between vector types (''__vector __bool unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vsc; // expected-warning {{Implicit conversion between vector types (''__vector signed char' (vector of 16 'signed char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
- vp = vuc; // expected-warning {{Implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated. In the future, the behavior implied by '-fno-lax-vector-conversions' will be the default.}}
+ vp = vbc; // expected-warning {{implicit conversion between vector types (''__vector __bool unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vsc; // expected-warning {{implicit conversion between vector types (''__vector signed char' (vector of 16 'signed char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
+ vp = vuc; // expected-warning {{implicit conversion between vector types (''__vector unsigned char' (vector of 16 'unsigned char' values)' and ''__vector __pixel ' (vector of 8 'unsigned short' values)') is deprecated; in the future, the behavior implied by '-fno-lax-vector-conversions' will be the default}}
}
diff --git a/clang/test/Parser/objcbridge-related-attribute.m b/clang/test/Parser/objcbridge-related-attribute.m
index 246afeef5198..e76d5e388141 100644
--- a/clang/test/Parser/objcbridge-related-attribute.m
+++ b/clang/test/Parser/objcbridge-related-attribute.m
@@ -5,10 +5,10 @@ typedef struct __attribute__((objc_bridge_related(NSColor,,CGColor))) CGColor *C
typedef struct __attribute__((objc_bridge_related(NSColor,,))) CGColor *CGColorRef2Ok;
typedef struct __attribute__((objc_bridge_related(NSColor,colorWithCGColor:,))) CGColor *CGColorRef3Ok;
-typedef struct __attribute__((objc_bridge_related(,colorWithCGColor:,CGColor))) CGColor *CGColorRef1NotOk; // expected-error {{expected a related ObjectiveC class name, e.g., 'NSColor'}}
+typedef struct __attribute__((objc_bridge_related(,colorWithCGColor:,CGColor))) CGColor *CGColorRef1NotOk; // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
typedef struct __attribute__((objc_bridge_related(NSColor,colorWithCGColor,CGColor))) CGColor *CGColorRef2NotOk; // expected-error {{expected a class method selector with single argument, e.g., 'colorWithCGColor:'}}
typedef struct __attribute__((objc_bridge_related(NSColor,colorWithCGColor::,CGColor))) CGColor *CGColorRef3NotOk; // expected-error {{expected a class method selector with single argument, e.g., 'colorWithCGColor:'}}
-typedef struct __attribute__((objc_bridge_related(12,colorWithCGColor:,CGColor))) CGColor *CGColorRef4NotOk; // expected-error {{expected a related ObjectiveC class name, e.g., 'NSColor'}}
+typedef struct __attribute__((objc_bridge_related(12,colorWithCGColor:,CGColor))) CGColor *CGColorRef4NotOk; // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
typedef struct __attribute__((objc_bridge_related(NSColor,+:,CGColor))) CGColor *CGColorRef5NotOk; // expected-error {{expected ','}}
typedef struct __attribute__((objc_bridge_related(NSColor,colorWithCGColor:,+))) CGColor *CGColorRef6NotOk; // expected-error {{expected ')'}}
diff --git a/clang/test/Parser/pragma-attribute.cpp b/clang/test/Parser/pragma-attribute.cpp
index bc8e7b9e78c6..6377fc754352 100644
--- a/clang/test/Parser/pragma-attribute.cpp
+++ b/clang/test/Parser/pragma-attribute.cpp
@@ -127,7 +127,7 @@ void function();
// expected-error@-1 {{attribute 'objc_bridge_related' can't be applied to 'function'}}
#pragma clang attribute pop
-#pragma clang attribute push (__attribute__((objc_bridge_related(1))), apply_to=function) // expected-error {{expected a related ObjectiveC class name, e.g., 'NSColor'}}
+#pragma clang attribute push (__attribute__((objc_bridge_related(1))), apply_to=function) // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
#pragma clang attribute push (__attribute__((used)), apply_to=function) // expected-error {{attribute 'used' is not supported by '#pragma clang attribute'}}
diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index ca51f2fc22c5..f0a2ef851287 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -793,9 +793,7 @@
// CHECK_KNL_M32: #define __AES__ 1
// CHECK_KNL_M32: #define __AVX2__ 1
// CHECK_KNL_M32: #define __AVX512CD__ 1
-// CHECK_KNL_M32: #define __AVX512ER__ 1
// CHECK_KNL_M32: #define __AVX512F__ 1
-// CHECK_KNL_M32: #define __AVX512PF__ 1
// CHECK_KNL_M32: #define __AVX__ 1
// CHECK_KNL_M32: #define __BMI2__ 1
// CHECK_KNL_M32: #define __BMI__ 1
@@ -808,7 +806,6 @@
// CHECK_KNL_M32: #define __MOVBE__ 1
// CHECK_KNL_M32: #define __PCLMUL__ 1
// CHECK_KNL_M32: #define __POPCNT__ 1
-// CHECK_KNL_M32: #define __PREFETCHWT1__ 1
// CHECK_KNL_M32: #define __PRFCHW__ 1
// CHECK_KNL_M32: #define __RDRND__ 1
// CHECK_KNL_M32: #define __SSE2__ 1
@@ -832,9 +829,7 @@
// CHECK_KNL_M64: #define __AES__ 1
// CHECK_KNL_M64: #define __AVX2__ 1
// CHECK_KNL_M64: #define __AVX512CD__ 1
-// CHECK_KNL_M64: #define __AVX512ER__ 1
// CHECK_KNL_M64: #define __AVX512F__ 1
-// CHECK_KNL_M64: #define __AVX512PF__ 1
// CHECK_KNL_M64: #define __AVX__ 1
// CHECK_KNL_M64: #define __BMI2__ 1
// CHECK_KNL_M64: #define __BMI__ 1
@@ -847,7 +842,6 @@
// CHECK_KNL_M64: #define __MOVBE__ 1
// CHECK_KNL_M64: #define __PCLMUL__ 1
// CHECK_KNL_M64: #define __POPCNT__ 1
-// CHECK_KNL_M64: #define __PREFETCHWT1__ 1
// CHECK_KNL_M64: #define __PRFCHW__ 1
// CHECK_KNL_M64: #define __RDRND__ 1
// CHECK_KNL_M64: #define __SSE2_MATH__ 1
@@ -874,9 +868,7 @@
// CHECK_KNM_M32: #define __AES__ 1
// CHECK_KNM_M32: #define __AVX2__ 1
// CHECK_KNM_M32: #define __AVX512CD__ 1
-// CHECK_KNM_M32: #define __AVX512ER__ 1
// CHECK_KNM_M32: #define __AVX512F__ 1
-// CHECK_KNM_M32: #define __AVX512PF__ 1
// CHECK_KNM_M32: #define __AVX512VPOPCNTDQ__ 1
// CHECK_KNM_M32: #define __AVX__ 1
// CHECK_KNM_M32: #define __BMI2__ 1
@@ -890,7 +882,6 @@
// CHECK_KNM_M32: #define __MOVBE__ 1
// CHECK_KNM_M32: #define __PCLMUL__ 1
// CHECK_KNM_M32: #define __POPCNT__ 1
-// CHECK_KNM_M32: #define __PREFETCHWT1__ 1
// CHECK_KNM_M32: #define __PRFCHW__ 1
// CHECK_KNM_M32: #define __RDRND__ 1
// CHECK_KNM_M32: #define __SSE2__ 1
@@ -911,9 +902,7 @@
// CHECK_KNM_M64: #define __AES__ 1
// CHECK_KNM_M64: #define __AVX2__ 1
// CHECK_KNM_M64: #define __AVX512CD__ 1
-// CHECK_KNM_M64: #define __AVX512ER__ 1
// CHECK_KNM_M64: #define __AVX512F__ 1
-// CHECK_KNM_M64: #define __AVX512PF__ 1
// CHECK_KNM_M64: #define __AVX512VPOPCNTDQ__ 1
// CHECK_KNM_M64: #define __AVX__ 1
// CHECK_KNM_M64: #define __BMI2__ 1
@@ -927,7 +916,6 @@
// CHECK_KNM_M64: #define __MOVBE__ 1
// CHECK_KNM_M64: #define __PCLMUL__ 1
// CHECK_KNM_M64: #define __POPCNT__ 1
-// CHECK_KNM_M64: #define __PREFETCHWT1__ 1
// CHECK_KNM_M64: #define __PRFCHW__ 1
// CHECK_KNM_M64: #define __RDRND__ 1
// CHECK_KNM_M64: #define __SSE2_MATH__ 1
diff --git a/clang/test/Preprocessor/stdc-ms-extension.cpp b/clang/test/Preprocessor/stdc-ms-extension.cpp
new file mode 100644
index 000000000000..6e9fa6055306
--- /dev/null
+++ b/clang/test/Preprocessor/stdc-ms-extension.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cl /TC /dev/null /E -Xclang -dM 2> /dev/null | FileCheck -match-full-lines %s --check-prefix=NOSTDC
+// RUN: %clang_cl /TC /dev/null /E -Xclang -dM /Zc:__STDC__ 2> /dev/null | FileCheck -match-full-lines %s --check-prefix=YESSTDC
+// __STDC__ should never be defined in C++ mode with fms-compatibility.
+// RUN: %clang_cl /dev/null /E -Xclang -dM 2>&1 | FileCheck %s --check-prefix=NOSTDC
+// RUN: %clang_cl /dev/null /E -Xclang -dM /Zc:__STDC__ 2>&1 | FileCheck %s --check-prefix=ZCSTDCIGNORED
+// YESSTDC: #define __STDC__ 1
+// NOSTDC-NOT: #define __STDC__ 1
+// ZCSTDCIGNORED-NOT: #define __STDC__ 1
+// ZCSTDCIGNORED: argument unused during compilation
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 57104c9e7a50..7567267be26b 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -90,38 +90,6 @@
// AVX512CD: #define __SSE__ 1
// AVX512CD: #define __SSSE3__ 1
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512ER %s
-
-// AVX512ER: #define __AVX2__ 1
-// AVX512ER: #define __AVX512ER__ 1
-// AVX512ER: #define __AVX512F__ 1
-// AVX512ER: #define __AVX__ 1
-// AVX512ER: #define __EVEX512__ 1
-// AVX512ER: #define __SSE2_MATH__ 1
-// AVX512ER: #define __SSE2__ 1
-// AVX512ER: #define __SSE3__ 1
-// AVX512ER: #define __SSE4_1__ 1
-// AVX512ER: #define __SSE4_2__ 1
-// AVX512ER: #define __SSE_MATH__ 1
-// AVX512ER: #define __SSE__ 1
-// AVX512ER: #define __SSSE3__ 1
-
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512PF %s
-
-// AVX512PF: #define __AVX2__ 1
-// AVX512PF: #define __AVX512F__ 1
-// AVX512PF: #define __AVX512PF__ 1
-// AVX512PF: #define __AVX__ 1
-// AVX512PF: #define __EVEX512__ 1
-// AVX512PF: #define __SSE2_MATH__ 1
-// AVX512PF: #define __SSE2__ 1
-// AVX512PF: #define __SSE3__ 1
-// AVX512PF: #define __SSE4_1__ 1
-// AVX512PF: #define __SSE4_2__ 1
-// AVX512PF: #define __SSE_MATH__ 1
-// AVX512PF: #define __SSE__ 1
-// AVX512PF: #define __SSSE3__ 1
-
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512DQ %s
// AVX512DQ: #define __AVX2__ 1
@@ -171,22 +139,6 @@
// AVX512VL: #define __SSE__ 1
// AVX512VL: #define __SSSE3__ 1
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512F2 %s
-
-// AVX512F2: #define __AVX2__ 1
-// AVX512F2-NOT: #define __AVX512F__ 1
-// AVX512F2-NOT: #define __AVX512PF__ 1
-// AVX512F2-NOT: #define __EVEX512__ 1
-// AVX512F2: #define __AVX__ 1
-// AVX512F2: #define __SSE2_MATH__ 1
-// AVX512F2: #define __SSE2__ 1
-// AVX512F2: #define __SSE3__ 1
-// AVX512F2: #define __SSE4_1__ 1
-// AVX512F2: #define __SSE4_2__ 1
-// AVX512F2: #define __SSE_MATH__ 1
-// AVX512F2: #define __SSE__ 1
-// AVX512F2: #define __SSSE3__ 1
-
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512ifma -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512IFMA %s
// AVX512IFMA: #define __AVX2__ 1
@@ -640,14 +592,12 @@
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOEVEX512 %s
// NOEVEX512-NOT: #define __AVX512F__ 1
// NOEVEX512-NOT: #define __EVEX256__ 1
// NOEVEX512-NOT: #define __EVEX512__ 1
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -mno-evex512 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512NOEVEX512 %s
// AVX512NOEVEX512: #define __AVX512F__ 1
// AVX512NOEVEX512-NOT: #define __EVEX256__ 1
// AVX512NOEVEX512-NOT: #define __EVEX512__ 1
diff --git a/clang/test/Profile/misexpect-branch.c b/clang/test/Profile/misexpect-branch.c
index ce46b4688061..5c4394405e17 100644
--- a/clang/test/Profile/misexpect-branch.c
+++ b/clang/test/Profile/misexpect-branch.c
@@ -26,10 +26,10 @@ int buzz();
const int inner_loop = 100;
const int outer_loop = 2000;
-int bar() { // imprecise-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+int bar() { // imprecise-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
int rando = buzz();
int x = 0;
- if (likely(rando % (outer_loop * inner_loop) == 0)) { // exact-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+ if (likely(rando % (outer_loop * inner_loop) == 0)) { // exact-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
x = baz(rando);
} else {
x = foo(50);
@@ -37,10 +37,10 @@ int bar() { // imprecise-warning-re {{Potential performance regression from use
return x;
}
-int fizz() { // imprecise-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+int fizz() { // imprecise-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
int rando = buzz();
int x = 0;
- if (unlikely(rando % (outer_loop * inner_loop) == 0)) { // exact-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+ if (unlikely(rando % (outer_loop * inner_loop) == 0)) { // exact-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
x = baz(rando);
} else {
x = foo(50);
diff --git a/clang/test/Profile/misexpect-switch-default.c b/clang/test/Profile/misexpect-switch-default.c
index 033490e558e6..cd337b943017 100644
--- a/clang/test/Profile/misexpect-switch-default.c
+++ b/clang/test/Profile/misexpect-switch-default.c
@@ -20,7 +20,7 @@ int main() {
int j;
for (j = 0; j < outer_loop * inner_loop; ++j) {
unsigned condition = rand() % 5;
- switch (__builtin_expect(condition, 6)) { // expected-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+ switch (__builtin_expect(condition, 6)) { // expected-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
case 0:
val += sum(arry, arry_size);
break;
diff --git a/clang/test/Profile/misexpect-switch.c b/clang/test/Profile/misexpect-switch.c
index 8ca8a155c74a..84a7174f635f 100644
--- a/clang/test/Profile/misexpect-switch.c
+++ b/clang/test/Profile/misexpect-switch.c
@@ -20,7 +20,7 @@ int main() {
for (j = 0; j < outer_loop; ++j) {
for (k = 0; k < inner_loop; ++k) {
unsigned condition = rand() % 10000;
- switch (__builtin_expect(condition, 0)) { // expected-warning-re {{Potential performance regression from use of __builtin_expect(): Annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions.}}
+ switch (__builtin_expect(condition, 0)) { // expected-warning-re {{potential performance regression from use of __builtin_expect(): annotation was correct on {{.+}}% ({{[0-9]+ / [0-9]+}}) of profiled executions}}
case 0:
val += sum(arry, arry_size);
break;
diff --git a/clang/test/Sema/attr-assume.c b/clang/test/Sema/attr-assume.c
deleted file mode 100644
index 98deffa3a746..000000000000
--- a/clang/test/Sema/attr-assume.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: %clang_cc1 -triple i386-apple-darwin9 -fsyntax-only -verify %s
-
-void f1(void) __attribute__((assume(3))); // expected-error {{expected string literal as argument of 'assume' attribute}}
-void f2(void) __attribute__((assume(int))); // expected-error {{expected string literal as argument of 'assume' attribute}}
-void f3(void) __attribute__((assume(for))); // expected-error {{expected string literal as argument of 'assume' attribute}}
-void f4(void) __attribute__((assume("QQQQ"))); // expected-warning {{unknown assumption string 'QQQQ'; attribute is potentially ignored}}
-void f5(void) __attribute__((assume("omp_no_openmp")));
-void f6(void) __attribute__((assume("omp_noopenmp"))); // expected-warning {{unknown assumption string 'omp_noopenmp' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp'?}}
-void f7(void) __attribute__((assume("omp_no_openmp_routine"))); // expected-warning {{unknown assumption string 'omp_no_openmp_routine' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp_routines'?}}
-void f8(void) __attribute__((assume("omp_no_openmp1"))); // expected-warning {{unknown assumption string 'omp_no_openmp1' may be misspelled; attribute is potentially ignored, did you mean 'omp_no_openmp'?}}
-void f9(void) __attribute__((assume("omp_no_openmp", "omp_no_openmp"))); // expected-error {{'assume' attribute takes one argument}}
-
-int g1 __attribute__((assume(0))); // expected-error {{expected string literal as argument of 'assume' attribute}}
-int g2 __attribute__((assume("omp_no_openmp"))); // expected-warning {{'assume' attribute only applies to functions and Objective-C methods}}
diff --git a/clang/test/Sema/attr-counted-by-late-parsed-off.c b/clang/test/Sema/attr-counted-by-late-parsed-off.c
new file mode 100644
index 000000000000..34f51d10c083
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-late-parsed-off.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -DNEEDS_LATE_PARSING -fno-experimental-late-parse-attributes -fsyntax-only -verify %s
+// RUN: %clang_cc1 -DNEEDS_LATE_PARSING -fsyntax-only -verify %s
+
+// RUN: %clang_cc1 -UNEEDS_LATE_PARSING -fno-experimental-late-parse-attributes -fsyntax-only -verify=ok %s
+// RUN: %clang_cc1 -UNEEDS_LATE_PARSING -fsyntax-only -verify=ok %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_known { int dummy; };
+
+#ifdef NEEDS_LATE_PARSING
+struct on_decl {
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *buf __counted_by(count);
+ int count;
+};
+
+#else
+
+// ok-no-diagnostics
+struct on_decl {
+ int count;
+ struct size_known *buf __counted_by(count);
+};
+
+#endif
diff --git a/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
new file mode 100644
index 000000000000..9ff3b080f657
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-late-parsed-struct-ptrs.c
@@ -0,0 +1,254 @@
+// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_unknown;
+struct size_known {
+ int field;
+};
+
+typedef void(*fn_ptr_ty)(void);
+
+//==============================================================================
+// __counted_by on struct member pointer in decl attribute position
+//==============================================================================
+
+struct on_member_pointer_complete_ty {
+ struct size_known * buf __counted_by(count);
+ int count;
+};
+
+struct on_member_pointer_incomplete_ty {
+ struct size_unknown * buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
+ int count;
+};
+
+struct on_member_pointer_const_incomplete_ty {
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
+ const struct size_unknown * buf __counted_by(count);
+ int count;
+};
+
+struct on_member_pointer_void_ty {
+ void* buf __counted_by(count); // expected-error{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty {
+ // buffer of `count` function pointers is allowed
+ void (**fn_ptr)(void) __counted_by(count);
+ int count;
+};
+
+
+struct on_member_pointer_fn_ptr_ty_ptr_ty {
+ // buffer of `count` function pointers is allowed
+ fn_ptr_ty* fn_ptr __counted_by(count);
+ int count;
+};
+
+struct on_member_pointer_fn_ty {
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ void (*fn_ptr)(void) __counted_by(count);
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_ty {
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ fn_ptr_ty fn_ptr __counted_by(count);
+ int count;
+};
+
+struct has_unannotated_vla {
+ int count;
+ int buffer[];
+};
+
+struct on_member_pointer_struct_with_vla {
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
+ struct has_unannotated_vla* objects __counted_by(count);
+ int count;
+};
+
+struct has_annotated_vla {
+ int count;
+ int buffer[] __counted_by(count);
+};
+
+// Currently prevented because computing the size of `objects` at runtime would
+// require an O(N) walk of `objects` to take into account the length of the VLA
+// in each struct instance.
+struct on_member_pointer_struct_with_annotated_vla {
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
+ struct has_annotated_vla* objects __counted_by(count);
+ int count;
+};
+
+struct on_pointer_anon_buf {
+ // TODO: Support referring to parent scope
+ struct {
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *buf __counted_by(count);
+ };
+ int count;
+};
+
+struct on_pointer_anon_count {
+ struct size_known *buf __counted_by(count);
+ struct {
+ int count;
+ };
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in type attribute position
+//==============================================================================
+// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
+// as a declaration attribute and is **not** late parsed resulting in the `count`
+// field being unavailable.
+
+struct on_member_pointer_complete_ty_ty_pos {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *__counted_by(count) buf;
+ int count;
+};
+
+struct on_member_pointer_incomplete_ty_ty_pos {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_unknown * __counted_by(count) buf;
+ int count;
+};
+
+struct on_member_pointer_const_incomplete_ty_ty_pos {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ const struct size_unknown * __counted_by(count) buf;
+ int count;
+};
+
+struct on_member_pointer_void_ty_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being an incomplete type.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void *__counted_by(count) buf;
+ int count;
+};
+
+// -
+
+struct on_member_pointer_fn_ptr_ty_pos {
+ // TODO: buffer of `count` function pointers should be allowed
+ // but fails because this isn't late parsed.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void (** __counted_by(count) fn_ptr)(void);
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_ptr_ty_pos {
+ // TODO: buffer of `count` function pointers should be allowed
+ // but fails because this isn't late parsed.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ fn_ptr_ty* __counted_by(count) fn_ptr;
+ int count;
+};
+
+struct on_member_pointer_fn_ty_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a function type.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void (* __counted_by(count) fn_ptr)(void);
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_ty_pos {
+ // TODO: buffer of `count` function pointers should be allowed
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void (** __counted_by(count) fn_ptr)(void);
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_typedef_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a function type.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ fn_ptr_ty __counted_by(count) fn_ptr;
+ int count;
+};
+
+struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a function type.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ void (* __counted_by(count) * fn_ptr)(void);
+ int count;
+};
+
+struct on_member_pointer_struct_with_vla_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a struct type with a VLA.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct has_unannotated_vla *__counted_by(count) objects;
+ int count;
+};
+
+struct on_member_pointer_struct_with_annotated_vla_ty_pos {
+ // TODO: This should fail because the attribute is
+ // on a pointer with the pointee being a struct type with a VLA.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct has_annotated_vla* __counted_by(count) objects;
+ int count;
+};
+
+struct on_nested_pointer_inner {
+ // TODO: This should be disallowed because in the `-fbounds-safety` model
+ // `__counted_by` can only be nested when used in function parameters.
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *__counted_by(count) *buf;
+ int count;
+};
+
+struct on_nested_pointer_outer {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known **__counted_by(count) buf;
+ int count;
+};
+
+struct on_pointer_anon_buf_ty_pos {
+ struct {
+ // TODO: Support referring to parent scope
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known * __counted_by(count) buf;
+ };
+ int count;
+};
+
+struct on_pointer_anon_count_ty_pos {
+ // TODO: Allow this
+ // expected-error@+1{{use of undeclared identifier 'count'}}
+ struct size_known *__counted_by(count) buf;
+ struct {
+ int count;
+ };
+};
+
+//==============================================================================
+// __counted_by on struct non-pointer members
+//==============================================================================
+
+struct on_pod_ty {
+ // expected-error@+1{{'counted_by' only applies to pointers or C99 flexible array members}}
+ int wrong_ty __counted_by(count);
+ int count;
+};
+
+struct on_void_ty {
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{field has incomplete type 'void'}}
+ void wrong_ty __counted_by(count);
+ int count;
+};
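As the TODO comments in this file note, only the declaration-attribute position is late parsed today. A sketch of the form the file exercises as the accepted case (struct and field names below are illustrative, reusing the same __counted_by macro):

#define __counted_by(f) __attribute__((counted_by(f)))

struct size_known { int field; };

struct example_buf {
  struct size_known *items __counted_by(n_items); /* decl position: late parsed */
  int n_items;                                     /* may follow the annotated pointer */
};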
diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
new file mode 100644
index 000000000000..9b0f2eafb13c
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c
@@ -0,0 +1,17 @@
+// __SVInt8_t is specific to ARM64 so specify that in the target triple
+// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct on_sizeless_pointee_ty {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because '__SVInt8_t' is a sizeless type}}
+ __SVInt8_t* member __counted_by(count);
+};
+
+struct on_sizeless_ty {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{field has sizeless type '__SVInt8_t'}}
+ __SVInt8_t member __counted_by(count);
+};
diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs.c b/clang/test/Sema/attr-counted-by-struct-ptrs.c
new file mode 100644
index 000000000000..cd2bfe36938b
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-struct-ptrs.c
@@ -0,0 +1,224 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct size_unknown;
+struct size_known {
+ int field;
+};
+
+typedef void(*fn_ptr_ty)(void);
+
+//==============================================================================
+// __counted_by on struct member pointer in decl attribute position
+//==============================================================================
+
+struct on_member_pointer_complete_ty {
+ int count;
+ struct size_known * buf __counted_by(count);
+};
+
+struct on_member_pointer_incomplete_ty {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
+ struct size_unknown * buf __counted_by(count);
+};
+
+struct on_member_pointer_const_incomplete_ty {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
+ const struct size_unknown * buf __counted_by(count);
+};
+
+struct on_member_pointer_void_ty {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
+ void* buf __counted_by(count);
+};
+
+struct on_member_pointer_fn_ptr_ty {
+ int count;
+ // buffer of `count` function pointers is allowed
+ void (**fn_ptr)(void) __counted_by(count);
+};
+
+struct on_member_pointer_fn_ptr_ty_ptr_ty {
+ int count;
+ // buffer of `count` function pointers is allowed
+ fn_ptr_ty* fn_ptr __counted_by(count);
+};
+
+struct on_member_pointer_fn_ty {
+ int count;
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ void (*fn_ptr)(void) __counted_by(count);
+};
+
+struct on_member_pointer_fn_ptr_ty_ty {
+ int count;
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ fn_ptr_ty fn_ptr __counted_by(count);
+};
+
+struct has_unannotated_vla {
+ int count;
+ int buffer[];
+};
+
+struct on_member_pointer_struct_with_vla {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
+ struct has_unannotated_vla* objects __counted_by(count);
+};
+
+struct has_annotated_vla {
+ int count;
+ int buffer[] __counted_by(count);
+};
+
+// Currently prevented because computing the size of `objects` at runtime would
+// require an O(N) walk of `objects` to take into account the length of the VLA
+// in each struct instance.
+struct on_member_pointer_struct_with_annotated_vla {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
+ struct has_annotated_vla* objects __counted_by(count);
+};
+
+struct on_pointer_anon_buf {
+ int count;
+ struct {
+ struct size_known *buf __counted_by(count);
+ };
+};
+
+struct on_pointer_anon_count {
+ struct {
+ int count;
+ };
+ struct size_known *buf __counted_by(count);
+};
+
+//==============================================================================
+// __counted_by on struct member pointer in type attribute position
+//==============================================================================
+// TODO: Correctly parse counted_by as a type attribute. Currently it is parsed
+// as a declaration attribute
+
+struct on_member_pointer_complete_ty_ty_pos {
+ int count;
+ struct size_known *__counted_by(count) buf;
+};
+
+struct on_member_pointer_incomplete_ty_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct size_unknown' is an incomplete type}}
+ struct size_unknown * __counted_by(count) buf;
+};
+
+struct on_member_pointer_const_incomplete_ty_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'const struct size_unknown' is an incomplete type}}
+ const struct size_unknown * __counted_by(count) buf;
+};
+
+struct on_member_pointer_void_ty_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void' is an incomplete type}}
+ void *__counted_by(count) buf;
+};
+
+// -
+
+struct on_member_pointer_fn_ptr_ty_pos {
+ int count;
+ // buffer of `count` function pointers is allowed
+ void (** __counted_by(count) fn_ptr)(void);
+};
+
+struct on_member_pointer_fn_ptr_ty_ptr_ty_pos {
+ int count;
+ // buffer of `count` function pointers is allowed
+ fn_ptr_ty* __counted_by(count) fn_ptr;
+};
+
+struct on_member_pointer_fn_ty_ty_pos {
+ int count;
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ void (* __counted_by(count) fn_ptr)(void);
+};
+
+struct on_member_pointer_fn_ptr_ty_ty_pos {
+ int count;
+ // buffer of `count` functions is not allowed
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'void (void)' is a function type}}
+ fn_ptr_ty __counted_by(count) fn_ptr;
+};
+
+// TODO: This should be forbidden but isn't due to counted_by being treated
+// as a declaration attribute.
+struct on_member_pointer_fn_ptr_ty_ty_pos_inner {
+ int count;
+ void (* __counted_by(count) * fn_ptr)(void);
+};
+
+struct on_member_pointer_struct_with_vla_ty_pos {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_unannotated_vla' is a struct type with a flexible array member}}
+ struct has_unannotated_vla *__counted_by(count) objects;
+};
+
+// Currently prevented because computing the size of `objects` at runtime would
+// require an O(N) walk of `objects` to take into account the length of the VLA
+// in each struct instance.
+struct on_member_pointer_struct_with_annotated_vla_ty_pos {
+ int count;
+  // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct has_annotated_vla' is a struct type with a flexible array member}}
+ struct has_annotated_vla* __counted_by(count) objects;
+};
+
+struct on_nested_pointer_inner {
+ // TODO: This should be disallowed because in the `-fbounds-safety` model
+ // `__counted_by` can only be nested when used in function parameters.
+ int count;
+ struct size_known *__counted_by(count) *buf;
+};
+
+struct on_nested_pointer_outer {
+ int count;
+ struct size_known **__counted_by(count) buf;
+};
+
+struct on_pointer_anon_buf_ty_pos {
+ int count;
+ struct {
+ struct size_known * __counted_by(count) buf;
+ };
+};
+
+struct on_pointer_anon_count_ty_pos {
+ struct {
+ int count;
+ };
+ struct size_known *__counted_by(count) buf;
+};
+
+//==============================================================================
+// __counted_by on struct non-pointer members
+//==============================================================================
+
+struct on_pod_ty {
+ int count;
+ // expected-error@+1{{'counted_by' only applies to pointers or C99 flexible array members}}
+ int wrong_ty __counted_by(count);
+};
+
+struct on_void_ty {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{field has incomplete type 'void'}}
+ void wrong_ty __counted_by(count);
+};
diff --git a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c
new file mode 100644
index 000000000000..31c0007501c4
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c
@@ -0,0 +1,11 @@
+// __SVInt8_t is specific to ARM64 so specify that in the target triple
+// RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct on_sizeless_elt_ty {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{array has sizeless element type '__SVInt8_t'}}
+ __SVInt8_t arr[] __counted_by(count);
+};
diff --git a/clang/test/Sema/attr-counted-by-vla.c b/clang/test/Sema/attr-counted-by-vla.c
new file mode 100644
index 000000000000..b25f719f3b95
--- /dev/null
+++ b/clang/test/Sema/attr-counted-by-vla.c
@@ -0,0 +1,196 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define __counted_by(f) __attribute__((counted_by(f)))
+
+struct bar;
+
+struct not_found {
+ int count;
+ struct bar *fam[] __counted_by(bork); // expected-error {{use of undeclared identifier 'bork'}}
+};
+
+struct no_found_count_not_in_substruct {
+ unsigned long flags;
+ unsigned char count; // expected-note {{'count' declared here}}
+ struct A {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+ } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct {
+ unsigned char count; // expected-note {{'count' declared here}}
+ struct {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+ } a;
+};
+
+struct not_found_count_not_in_unnamed_substruct_2 {
+ struct {
+ unsigned char count; // expected-note {{'count' declared here}}
+ };
+ struct {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
+ } a;
+};
+
+struct not_found_count_in_other_unnamed_substruct {
+ struct {
+ unsigned char count;
+ } a1;
+
+ struct {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+ };
+};
+
+struct not_found_count_in_other_substruct {
+ struct _a1 {
+ unsigned char count;
+ } a1;
+
+ struct {
+ int dummy;
+ int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+ };
+};
+
+struct not_found_count_in_other_substruct_2 {
+ struct _a2 {
+ unsigned char count;
+ } a2;
+
+ int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
+};
+
+struct not_found_suggest {
+ int bork;
+ struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
+};
+
+int global; // expected-note {{'global' declared here}}
+
+struct found_outside_of_struct {
+ int bork;
+ struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' not inside structure}}
+};
+
+struct self_referrential {
+ int bork;
+ struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
+};
+
+struct non_int_count {
+ double dbl_count;
+ struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
+};
+
+struct array_of_ints_count {
+ int integers[2];
+ struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
+};
+
+struct not_a_fam {
+ int count;
+ // expected-error@+1{{'counted_by' cannot be applied to a pointer with pointee of unknown size because 'struct bar' is an incomplete type}}
+ struct bar *non_fam __counted_by(count);
+};
+
+struct not_a_c99_fam {
+ int count;
+ struct bar *non_c99_fam[0] __counted_by(count); // expected-error {{'counted_by' on arrays only applies to C99 flexible array members}}
+};
+
+struct annotated_with_anon_struct {
+ unsigned long flags;
+ struct {
+ unsigned char count;
+ int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
+ };
+};
+
+//==============================================================================
+// __counted_by on a struct VLA with element type that has unknown size
+//==============================================================================
+
+struct size_unknown; // expected-note 2{{forward declaration of 'struct size_unknown'}}
+struct on_member_arr_incomplete_ty_ty_pos {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{array has incomplete element type 'struct size_unknown'}}
+ struct size_unknown buf[] __counted_by(count);
+};
+
+struct on_member_arr_incomplete_const_ty_ty_pos {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{array has incomplete element type 'const struct size_unknown'}}
+ const struct size_unknown buf[] __counted_by(count);
+};
+
+struct on_member_arr_void_ty_ty_pos {
+ int count;
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{array has incomplete element type 'void'}}
+ void buf[] __counted_by(count);
+};
+
+typedef void(fn_ty)(int);
+
+struct on_member_arr_fn_ptr_ty {
+ int count;
+  // An array of function pointers is allowed
+ fn_ty* buf[] __counted_by(count);
+};
+
+struct on_member_arr_fn_ty {
+ int count;
+ // An array of functions is not allowed.
+ // expected-error@+2{{'counted_by' only applies to pointers or C99 flexible array members}}
+ // expected-error@+1{{'buf' declared as array of functions of type 'fn_ty' (aka 'void (int)')}}
+ fn_ty buf[] __counted_by(count);
+};
+
+
+// `buffer_of_structs_with_unnannotated_vla`,
+// `buffer_of_structs_with_annotated_vla`, and
+// `buffer_of_const_structs_with_annotated_vla` are currently prevented because
+// computing the size of `Arr` at runtime would require an O(N) walk of `Arr`
+// elements to take into account the length of the VLA in each struct instance.
+
+struct has_unannotated_VLA {
+ int count;
+ char buffer[];
+};
+
+struct has_annotated_VLA {
+ int count;
+ char buffer[] __counted_by(count);
+};
+
+struct buffer_of_structs_with_unnannotated_vla {
+ int count;
+ // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
+ // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'struct has_unannotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+ struct has_unannotated_VLA Arr[] __counted_by(count);
+};
+
+
+struct buffer_of_structs_with_annotated_vla {
+ int count;
+ // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
+ // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'struct has_annotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+ struct has_annotated_VLA Arr[] __counted_by(count);
+};
+
+struct buffer_of_const_structs_with_annotated_vla {
+ int count;
+ // Treating this as a warning is a temporary fix for existing attribute adopters. It **SHOULD BE AN ERROR**.
+ // Make sure the `const` qualifier is printed when printing the element type.
+ // expected-warning@+1{{'counted_by' should not be applied to an array with element of unknown size because 'const struct has_annotated_VLA' is a struct type with a flexible array member. This will be an error in a future compiler version}}
+ const struct has_annotated_VLA Arr[] __counted_by(count);
+};
+
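For contrast with the rejected cases above, the well-formed pattern the attribute targets is a C99 flexible array member whose element count lives in the same struct. The allocation helper below is an illustrative sketch (names assumed), not part of the test:

#include <stdlib.h>

#define __counted_by(f) __attribute__((counted_by(f)))

struct counted_fam {
  int count;
  char buffer[] __counted_by(count);  /* valid: count is in the same struct */
};

/* Allocate room for n trailing elements and record n in the counted_by field. */
static struct counted_fam *counted_fam_alloc(int n) {
  struct counted_fam *p = malloc(sizeof *p + (size_t)n * sizeof p->buffer[0]);
  if (p)
    p->count = n;
  return p;
}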
diff --git a/clang/test/Sema/attr-counted-by.c b/clang/test/Sema/attr-counted-by.c
deleted file mode 100644
index d5d4ebf55739..000000000000
--- a/clang/test/Sema/attr-counted-by.c
+++ /dev/null
@@ -1,112 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-#define __counted_by(f) __attribute__((counted_by(f)))
-
-struct bar;
-
-struct not_found {
- int count;
- struct bar *fam[] __counted_by(bork); // expected-error {{use of undeclared identifier 'bork'}}
-};
-
-struct no_found_count_not_in_substruct {
- unsigned long flags;
- unsigned char count; // expected-note {{'count' declared here}}
- struct A {
- int dummy;
- int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
- } a;
-};
-
-struct not_found_count_not_in_unnamed_substruct {
- unsigned char count; // expected-note {{'count' declared here}}
- struct {
- int dummy;
- int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
- } a;
-};
-
-struct not_found_count_not_in_unnamed_substruct_2 {
- struct {
- unsigned char count; // expected-note {{'count' declared here}}
- };
- struct {
- int dummy;
- int array[] __counted_by(count); // expected-error {{'counted_by' field 'count' isn't within the same struct as the flexible array}}
- } a;
-};
-
-struct not_found_count_in_other_unnamed_substruct {
- struct {
- unsigned char count;
- } a1;
-
- struct {
- int dummy;
- int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
- };
-};
-
-struct not_found_count_in_other_substruct {
- struct _a1 {
- unsigned char count;
- } a1;
-
- struct {
- int dummy;
- int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
- };
-};
-
-struct not_found_count_in_other_substruct_2 {
- struct _a2 {
- unsigned char count;
- } a2;
-
- int array[] __counted_by(count); // expected-error {{use of undeclared identifier 'count'}}
-};
-
-struct not_found_suggest {
- int bork;
- struct bar *fam[] __counted_by(blork); // expected-error {{use of undeclared identifier 'blork'}}
-};
-
-int global; // expected-note {{'global' declared here}}
-
-struct found_outside_of_struct {
- int bork;
- struct bar *fam[] __counted_by(global); // expected-error {{field 'global' in 'counted_by' not inside structure}}
-};
-
-struct self_referrential {
- int bork;
- struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}}
-};
-
-struct non_int_count {
- double dbl_count;
- struct bar *fam[] __counted_by(dbl_count); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
-};
-
-struct array_of_ints_count {
- int integers[2];
- struct bar *fam[] __counted_by(integers); // expected-error {{'counted_by' requires a non-boolean integer type argument}}
-};
-
-struct not_a_fam {
- int count;
- struct bar *non_fam __counted_by(count); // expected-error {{'counted_by' only applies to C99 flexible array members}}
-};
-
-struct not_a_c99_fam {
- int count;
- struct bar *non_c99_fam[0] __counted_by(count); // expected-error {{'counted_by' only applies to C99 flexible array members}}
-};
-
-struct annotated_with_anon_struct {
- unsigned long flags;
- struct {
- unsigned char count;
- int array[] __counted_by(crount); // expected-error {{use of undeclared identifier 'crount'}}
- };
-};
diff --git a/clang/test/Sema/attr-objc-bridge-related.m b/clang/test/Sema/attr-objc-bridge-related.m
index 7b2e3e5df3fe..6c7fb2588dbc 100644
--- a/clang/test/Sema/attr-objc-bridge-related.m
+++ b/clang/test/Sema/attr-objc-bridge-related.m
@@ -3,5 +3,5 @@
struct [[clang::objc_bridge_related(NSParagraphStyle,,)]] TestBridgedRef;
struct [[clang::objc_bridge_related(NSColor,colorWithCGColor:,CGColor)]] CGColorRefOk;
-struct [[clang::objc_bridge_related(,colorWithCGColor:,CGColor)]] CGColorRef1NotOk; // expected-error {{expected a related ObjectiveC class name, e.g., 'NSColor'}}
+struct [[clang::objc_bridge_related(,colorWithCGColor:,CGColor)]] CGColorRef1NotOk; // expected-error {{expected a related Objective-C class name, e.g., 'NSColor'}}
struct [[clang::objc_bridge_related(NSColor,colorWithCGColor::,CGColor)]] CGColorRef3NotOk; // expected-error {{expected a class method selector with single argument, e.g., 'colorWithCGColor:'}}
diff --git a/clang/test/Sema/builtins-x86.c b/clang/test/Sema/builtins-x86.c
index cbaf7bcde871..7d9cdce3d789 100644
--- a/clang/test/Sema/builtins-x86.c
+++ b/clang/test/Sema/builtins-x86.c
@@ -106,14 +106,6 @@ __m128i test_mm_mask_i32gather_epi32(__m128i a, int const *b, __m128i c, __m128i
return __builtin_ia32_gatherd_d(a, b, c, mask, 5); // expected-error {{scale argument must be 1, 2, 4, or 8}}
}
-void _mm512_mask_prefetch_i32gather_ps(__m512i index, __mmask16 mask, int const *addr) {
- __builtin_ia32_gatherpfdps(mask, index, addr, 5, 1); // expected-error {{scale argument must be 1, 2, 4, or 8}}
-}
-
-void _mm512_mask_prefetch_i32gather_ps_2(__m512i index, __mmask16 mask, int const *addr) {
- __builtin_ia32_gatherpfdps(mask, index, addr, 1, 1); // expected-error {{argument value 1 is outside the valid range [2, 3]}}
-}
-
__m512i test_mm512_shldi_epi64(__m512i __A, __m512i __B) {
return __builtin_ia32_vpshldq512(__A, __B, 1024); // expected-error {{argument value 1024 is outside the valid range [0, 255]}}
}
diff --git a/clang/test/Sema/builtins.c b/clang/test/Sema/builtins.c
index 3bee31459529..4f843aeec24e 100644
--- a/clang/test/Sema/builtins.c
+++ b/clang/test/Sema/builtins.c
@@ -277,9 +277,9 @@ void test21(const int *ptr) {
}
void test_ei_i42i(_BitInt(42) *ptr, int value) {
- __sync_fetch_and_add(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+ __sync_fetch_and_add(ptr, value); // expected-error {{atomic memory operand must have a power-of-two size}}
// expected-warning@+1 {{the semantics of this intrinsic changed with GCC version 4.4 - the newer semantics are provided here}}
- __sync_nand_and_fetch(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+ __sync_nand_and_fetch(ptr, value); // expected-error {{atomic memory operand must have a power-of-two size}}
__atomic_fetch_add(ptr, 1, 0); // expected-error {{argument to atomic builtin of type '_BitInt' is not supported}}
}
@@ -305,9 +305,9 @@ void test_ei_ii64(int *ptr, _BitInt(64) value) {
}
void test_ei_i42i42(_BitInt(42) *ptr, _BitInt(42) value) {
- __sync_fetch_and_add(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+ __sync_fetch_and_add(ptr, value); // expected-error {{atomic memory operand must have a power-of-two size}}
// expected-warning@+1 {{the semantics of this intrinsic changed with GCC version 4.4 - the newer semantics are provided here}}
- __sync_nand_and_fetch(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}}
+ __sync_nand_and_fetch(ptr, value); // expected-error {{atomic memory operand must have a power-of-two size}}
}
void test_ei_i64i64(_BitInt(64) *ptr, _BitInt(64) value) {
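The hunks above only adjust the diagnostic casing for the _BitInt(42) cases; the power-of-two counterpart visible in the context line (test_ei_i64i64) is the accepted form. A sketch of that accepted case, assuming its body follows the same pattern as the functions shown:

/* Sketch: 64 is a power-of-two width, so the __sync builtins accept it without
 * the "power-of-two size" diagnostic emitted for _BitInt(42) above. */
void ok_i64(_BitInt(64) *ptr, _BitInt(64) value) {
  __sync_fetch_and_add(ptr, value);
}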
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
index ddb78696ce62..c6b1b37cef28 100644
--- a/clang/test/Sema/constant_builtins_vector.cpp
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -719,7 +719,7 @@ constexpr vector4char
vectorShuffleFail1 = // expected-error {{constexpr variable 'vectorShuffleFail1'\
must be initialized by a constant expression}}
__builtin_shufflevector( // expected-error {{index for __builtin_shufflevector \
-not within the bounds of the input vectors; index of -1 found at position 0 not \
-permitted in a constexpr context.}}
+not within the bounds of the input vectors; index of -1 found at position 0 is not \
+permitted in a constexpr context}}
vector4charConst1,
vector4charConst2, -1, -1, -1, -1);
diff --git a/clang/test/Sema/fmv-namespace.cpp b/clang/test/Sema/fmv-namespace.cpp
new file mode 100644
index 000000000000..1c12fd66cf24
--- /dev/null
+++ b/clang/test/Sema/fmv-namespace.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+namespace Name {
+int __attribute((target_version("default"))) foo() { return 0; }
+}
+
+namespace Name {
+int __attribute((target_version("sve"))) foo() { return 1; }
+}
+
+int bar() { return Name::foo(); }
diff --git a/clang/test/Sema/x86-eval-method.c b/clang/test/Sema/x86-eval-method.c
index f475b0d1b29b..e540a59528b6 100644
--- a/clang/test/Sema/x86-eval-method.c
+++ b/clang/test/Sema/x86-eval-method.c
@@ -10,9 +10,9 @@
float add1(float a, float b, float c) {
return a + b + c;
-} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}}
+} // warn-warning{{setting the floating point evaluation method to `source` on a target without SSE is not supported}}
float add2(float a, float b, float c) {
#pragma clang fp eval_method(source)
return a + b + c;
-} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}}
+} // warn-warning{{setting the floating point evaluation method to `source` on a target without SSE is not supported}}
diff --git a/clang/test/Sema/x86_64-eval-method.c b/clang/test/Sema/x86_64-eval-method.c
index dbdc1f881b4a..fe4368a42ca1 100644
--- a/clang/test/Sema/x86_64-eval-method.c
+++ b/clang/test/Sema/x86_64-eval-method.c
@@ -10,4 +10,4 @@
float add2(float a, float b, float c) {
#pragma clang fp eval_method(source)
return a + b + c;
-} // warn-warning{{Setting the floating point evaluation method to `source` on a target without SSE is not supported.}}
+} // warn-warning{{setting the floating point evaluation method to `source` on a target without SSE is not supported}}
diff --git a/clang/test/SemaCUDA/device-var-init.cu b/clang/test/SemaCUDA/device-var-init.cu
index ee7a9e2276f2..1555d151c259 100644
--- a/clang/test/SemaCUDA/device-var-init.cu
+++ b/clang/test/SemaCUDA/device-var-init.cu
@@ -13,17 +13,17 @@
#include "Inputs/cuda-initializers.h"
__shared__ int s_v_i = 1;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__device__ int d_v_f = f();
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ int s_v_f = f();
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ int c_v_f = f();
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T s_t_i = {2};
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__device__ T d_t_i = {2};
__constant__ T c_t_i = {2};
@@ -40,175 +40,175 @@ __shared__ CGTC s_cgtc;
__constant__ CGTC c_cgtc;
__device__ EC d_ec_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ EC s_ec_i(3);
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ EC c_ec_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ EC d_ec_i2 = {3};
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ EC s_ec_i2 = {3};
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ EC c_ec_i2 = {3};
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ ETC d_etc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ ETC s_etc_i(3);
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ ETC c_etc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ ETC d_etc_i2 = {3};
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ ETC s_etc_i2 = {3};
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ ETC c_etc_i2 = {3};
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ UC d_uc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ UC s_uc;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ UC c_uc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ UD d_ud;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ UD s_ud;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ UD c_ud;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ ECI d_eci;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ ECI s_eci;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ ECI c_eci;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NEC d_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NEC s_nec;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NEC c_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NED d_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NED s_ned;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NED c_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NCV d_ncv;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NCV s_ncv;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NCV c_ncv;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ VD d_vd;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ VD s_vd;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ VD c_vd;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NCF d_ncf;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NCF s_ncf;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NCF c_ncf;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NCFS s_ncfs;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__device__ UTC d_utc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ UTC s_utc;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ UTC c_utc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ UTC d_utc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ UTC s_utc_i(3);
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ UTC c_utc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NETC d_netc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NETC s_netc;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NETC c_netc;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ NETC d_netc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ NETC s_netc_i(3);
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ NETC c_netc_i(3);
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ EC_I_EC1 d_ec_i_ec1;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ EC_I_EC1 s_ec_i_ec1;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ EC_I_EC1 c_ec_i_ec1;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_V_T d_t_v_t;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_V_T s_t_v_t;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_V_T c_t_v_t;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_B_NEC d_t_b_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_B_NEC s_t_b_nec;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_B_NEC c_t_b_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_F_NEC d_t_f_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_F_NEC s_t_f_nec;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_F_NEC c_t_f_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_FA_NEC d_t_fa_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_FA_NEC s_t_fa_nec;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_FA_NEC c_t_fa_nec;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_B_NED d_t_b_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_B_NED s_t_b_ned;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_B_NED c_t_b_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_F_NED d_t_f_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_F_NED s_t_f_ned;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_F_NED c_t_f_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ T_FA_NED d_t_fa_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__shared__ T_FA_NED s_t_fa_ned;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
__constant__ T_FA_NED c_t_fa_ned;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
// Verify that local variables may be static on device
// side and that they conform to the initialization constraints.
@@ -244,14 +244,14 @@ __device__ void df_sema() {
// Same test cases as for the globals above.
static __device__ int d_v_f = f();
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ int s_v_f = f();
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ int c_v_f = f();
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T s_t_i = {2};
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __device__ T d_t_i = {2};
static __constant__ T c_t_i = {2};
@@ -260,175 +260,175 @@ __device__ void df_sema() {
static __constant__ ECD c_ecd_i;
static __device__ EC d_ec_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ EC s_ec_i(3);
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ EC c_ec_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ EC d_ec_i2 = {3};
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ EC s_ec_i2 = {3};
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ EC c_ec_i2 = {3};
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ ETC d_etc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ ETC s_etc_i(3);
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ ETC c_etc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ ETC d_etc_i2 = {3};
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ ETC s_etc_i2 = {3};
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ ETC c_etc_i2 = {3};
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ UC d_uc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ UC s_uc;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ UC c_uc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ UD d_ud;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ UD s_ud;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ UD c_ud;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ ECI d_eci;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ ECI s_eci;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ ECI c_eci;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NEC d_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NEC s_nec;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NEC c_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NED d_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NED s_ned;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NED c_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NCV d_ncv;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NCV s_ncv;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NCV c_ncv;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ VD d_vd;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ VD s_vd;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ VD c_vd;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NCF d_ncf;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NCF s_ncf;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NCF c_ncf;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NCFS s_ncfs;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __device__ UTC d_utc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ UTC s_utc;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ UTC c_utc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ UTC d_utc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ UTC s_utc_i(3);
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ UTC c_utc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NETC d_netc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NETC s_netc;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NETC c_netc;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ NETC d_netc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ NETC s_netc_i(3);
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ NETC c_netc_i(3);
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ EC_I_EC1 d_ec_i_ec1;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ EC_I_EC1 s_ec_i_ec1;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ EC_I_EC1 c_ec_i_ec1;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_V_T d_t_v_t;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_V_T s_t_v_t;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_V_T c_t_v_t;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_B_NEC d_t_b_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_B_NEC s_t_b_nec;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_B_NEC c_t_b_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_F_NEC d_t_f_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_F_NEC s_t_f_nec;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_F_NEC c_t_f_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_FA_NEC d_t_fa_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_FA_NEC s_t_fa_nec;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_FA_NEC c_t_fa_nec;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_B_NED d_t_b_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_B_NED s_t_b_ned;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_B_NED c_t_b_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_F_NED d_t_f_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_F_NED s_t_f_ned;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_F_NED c_t_f_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __device__ T_FA_NED d_t_fa_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
static __shared__ T_FA_NED s_t_fa_ned;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
static __constant__ T_FA_NED c_t_fa_ned;
- // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+ // expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
}
__host__ __device__ void hd_sema() {
@@ -449,7 +449,7 @@ struct NontrivialInitializer {
template <typename T>
__global__ void bar() {
__shared__ T bad;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
for (int i = 0; i < 10; i++) {
static __device__ CEEC sd_ceec;
static __shared__ CEEC ss_ceec;
@@ -467,7 +467,7 @@ __global__ void bar() {
template <>
__global__ void bar<int>() {
__shared__ NontrivialInitializer bad;
-// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+// expected-error@-1 {{initialization is not supported for __shared__ variables}}
for (int i = 0; i < 10; i++) {
static __device__ CEEC sd_ceec;
static __shared__ CEEC ss_ceec;
diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu
index 163648cd9a87..4710c81763ad 100644
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@@ -469,7 +469,7 @@ int test_constexpr_overload(C2 &x, C2 &y) {
// Verify no ambiguity for new operator.
void *a = new int;
__device__ void *b = new int;
-// expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
// Verify no ambiguity for new operator.
template<typename _Tp> _Tp&& f();
diff --git a/clang/test/SemaCUDA/union-init.cu b/clang/test/SemaCUDA/union-init.cu
index 9e4d14a71069..dd4b1296b713 100644
--- a/clang/test/SemaCUDA/union-init.cu
+++ b/clang/test/SemaCUDA/union-init.cu
@@ -31,14 +31,14 @@ union D {
__device__ B b;
__device__ C c;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ D d;
-// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables.}}
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
__device__ void foo() {
__shared__ B b;
__shared__ C c;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
__shared__ D d;
- // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+ // expected-error@-1 {{initialization is not supported for __shared__ variables}}
}
diff --git a/clang/test/SemaCXX/MicrosoftExtensions.cpp b/clang/test/SemaCXX/MicrosoftExtensions.cpp
index 7286217b1644..98c19975095b 100644
--- a/clang/test/SemaCXX/MicrosoftExtensions.cpp
+++ b/clang/test/SemaCXX/MicrosoftExtensions.cpp
@@ -571,11 +571,17 @@ class PR34109_class {
virtual ~PR34109_class() {}
};
+#if !defined(__cpp_sized_deallocation)
void operator delete(void *) throw();
// expected-note@-1 {{previous declaration is here}}
__declspec(dllexport) void operator delete(void *) throw();
// expected-error@-1 {{redeclaration of 'operator delete' cannot add 'dllexport' attribute}}
-
+#else
+void operator delete(void *, unsigned int) throw();
+// expected-note@-1 {{previous declaration is here}}
+__declspec(dllexport) void operator delete(void *, unsigned int) throw();
+// expected-error@-1 {{redeclaration of 'operator delete' cannot add 'dllexport' attribute}}
+#endif
void PR34109(int* a) {
delete a;
}
diff --git a/clang/test/SemaCXX/addr-label-in-coroutines.cpp b/clang/test/SemaCXX/addr-label-in-coroutines.cpp
index e37ee6413437..65d78636e5cd 100644
--- a/clang/test/SemaCXX/addr-label-in-coroutines.cpp
+++ b/clang/test/SemaCXX/addr-label-in-coroutines.cpp
@@ -13,9 +13,9 @@ struct resumable {
};
resumable f1(int &out, int *inst) {
- static void* dispatch_table[] = {&&inc, // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- &&suspend, // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- &&stop}; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
+ static void* dispatch_table[] = {&&inc, // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ &&suspend, // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ &&stop}; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
#define DISPATCH() goto *dispatch_table[*inst++]
inc:
out++;
@@ -31,9 +31,9 @@ stop:
resumable f2(int &out, int *inst) {
void* dispatch_table[] = {nullptr, nullptr, nullptr};
- dispatch_table[0] = &&inc; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- dispatch_table[1] = &&suspend; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- dispatch_table[2] = &&stop; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
+ dispatch_table[0] = &&inc; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ dispatch_table[1] = &&suspend; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ dispatch_table[2] = &&stop; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
#define DISPATCH() goto *dispatch_table[*inst++]
inc:
out++;
@@ -50,9 +50,9 @@ stop:
resumable f3(int &out, int *inst) {
void* dispatch_table[] = {nullptr, nullptr, nullptr};
[&]() -> resumable {
- dispatch_table[0] = &&inc; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- dispatch_table[1] = &&suspend; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
- dispatch_table[2] = &&stop; // expected-error {{the GNU address of label extension is not allowed in coroutines.}}
+ dispatch_table[0] = &&inc; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ dispatch_table[1] = &&suspend; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
+ dispatch_table[2] = &&stop; // expected-error {{the GNU address of label extension is not allowed in coroutines}}
#define DISPATCH() goto *dispatch_table[*inst++]
inc:
out++;
diff --git a/clang/test/SemaCXX/builtin-operator-new-delete.cpp b/clang/test/SemaCXX/builtin-operator-new-delete.cpp
index 6fcff92dc095..db15616803e3 100644
--- a/clang/test/SemaCXX/builtin-operator-new-delete.cpp
+++ b/clang/test/SemaCXX/builtin-operator-new-delete.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1z -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++1z -fno-sized-deallocation -fsyntax-only -verify %s
// RUN: %clang_cc1 -std=c++03 -fsyntax-only -verify %s
// RUN: %clang_cc1 -std=c++03 -faligned-allocation -fsyntax-only -verify %s
// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
diff --git a/clang/test/SemaCXX/constexpr-default-arg.cpp b/clang/test/SemaCXX/constexpr-default-arg.cpp
index ec9b2927880b..901123bfb359 100644
--- a/clang/test/SemaCXX/constexpr-default-arg.cpp
+++ b/clang/test/SemaCXX/constexpr-default-arg.cpp
@@ -32,8 +32,8 @@ void test_default_arg2() {
}
// Check that multiple CXXDefaultInitExprs don't cause an assertion failure.
-struct A { int &&r = 0; }; // expected-note 2{{default member initializer}}
+struct A { int &&r = 0; };
struct B { A x, y; };
-B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}}
+B b = {}; // expected-no-diagnostics
}
diff --git a/clang/test/SemaCXX/cxx11-default-member-initializers.cpp b/clang/test/SemaCXX/cxx11-default-member-initializers.cpp
index dd8e9c6b7fc1..1ea8b98cd863 100644
--- a/clang/test/SemaCXX/cxx11-default-member-initializers.cpp
+++ b/clang/test/SemaCXX/cxx11-default-member-initializers.cpp
@@ -27,6 +27,80 @@ class MemInit {
C m = s;
};
+namespace std {
+typedef decltype(sizeof(int)) size_t;
+
+// libc++'s implementation
+template <class _E> class initializer_list {
+ const _E *__begin_;
+ size_t __size_;
+
+ initializer_list(const _E *__b, size_t __s) : __begin_(__b), __size_(__s) {}
+
+public:
+ typedef _E value_type;
+ typedef const _E &reference;
+ typedef const _E &const_reference;
+ typedef size_t size_type;
+
+ typedef const _E *iterator;
+ typedef const _E *const_iterator;
+
+ initializer_list() : __begin_(nullptr), __size_(0) {}
+
+ size_t size() const { return __size_; }
+ const _E *begin() const { return __begin_; }
+ const _E *end() const { return __begin_ + __size_; }
+};
+} // namespace std
+
+#if __cplusplus >= 201703L
+namespace test_rebuild {
+template <typename T, int> class C {
+public:
+ C(std::initializer_list<T>);
+};
+
+template <typename T> using Ptr = __remove_pointer(T) *;
+template <typename T> C(T) -> C<Ptr<T>, sizeof(T)>;
+
+class A {
+public:
+ template <typename T1, typename T2> T1 *some_func(T2 &&);
+};
+
+struct B : A {
+ // Test CXXDefaultInitExpr rebuild issue in
+ // https://github.com/llvm/llvm-project/pull/87933
+ int *ar = some_func<int>(C{some_func<int>(0)});
+ B() {}
+};
+
+int TestBody_got;
+template <int> class Vector {
+public:
+ Vector(std::initializer_list<int>);
+};
+template <typename... Ts> Vector(Ts...) -> Vector<sizeof...(Ts)>;
+class ProgramBuilder {
+public:
+ template <typename T, typename ARGS> int *create(ARGS);
+};
+
+struct TypeTest : ProgramBuilder {
+ int *str_f16 = create<int>(Vector{0});
+ TypeTest() {}
+};
+class TypeTest_Element_Test : TypeTest {
+ void TestBody();
+};
+void TypeTest_Element_Test::TestBody() {
+ int *expect = str_f16;
+ &TestBody_got != expect; // expected-warning {{inequality comparison result unused}}
+}
+} // namespace test_rebuild
+#endif // __cplusplus >= 201703L
+
#if __cplusplus >= 202002L
// This test ensures cleanup expressions are correctly produced
// in the presence of default member initializers.
diff --git a/clang/test/SemaCXX/cxx1y-sized-deallocation.cpp b/clang/test/SemaCXX/cxx1y-sized-deallocation.cpp
index 3ec65a6a64d1..462f1725bb1c 100644
--- a/clang/test/SemaCXX/cxx1y-sized-deallocation.cpp
+++ b/clang/test/SemaCXX/cxx1y-sized-deallocation.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1y -verify %s -fsized-deallocation -fexceptions -fcxx-exceptions
+// RUN: %clang_cc1 -std=c++1y -verify %s -fexceptions -fcxx-exceptions
using size_t = decltype(sizeof(0));
void operator delete(void *, size_t) noexcept; // expected-note {{'operator delete' declared here}}
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 4c6ef5adae7d..b71dfc6ccaf4 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -284,7 +284,7 @@ class Foo {};
// Verify that template template type parameter TTP is referenced/used in the
// template arguments of the RHS.
template <template<typename> typename TTP>
-using Bar = Foo<K<TTP>>; // expected-note {{candidate template ignored: could not match 'Foo<K<>>' against 'int'}}
+using Bar = Foo<K<TTP>>; // expected-note {{candidate template ignored: could not match 'Foo<K<template-parameter-0-0>>' against 'int'}}
template <class T>
class Container {};
diff --git a/clang/test/SemaCXX/cxx23-assume.cpp b/clang/test/SemaCXX/cxx23-assume.cpp
index e67d72ae0a99..ea71e7b25182 100644
--- a/clang/test/SemaCXX/cxx23-assume.cpp
+++ b/clang/test/SemaCXX/cxx23-assume.cpp
@@ -58,6 +58,11 @@ void g(int x) {
[[assume(true)]] while (false) {} // expected-error {{only applies to empty statements}}
[[assume(true)]] label:; // expected-error {{cannot be applied to a declaration}}
[[assume(true)]] goto label; // expected-error {{only applies to empty statements}}
+
+ // Also check variant spellings.
+ __attribute__((__assume__(true))); // Should not issue a warning because it doesn't use the [[]] spelling.
+ __attribute__((assume(true))) {}; // expected-error {{only applies to empty statements}}
+ [[clang::assume(true)]] {}; // expected-error {{only applies to empty statements}}
}
// Check that 'x' is ODR-used here.
@@ -143,3 +148,13 @@ template <bool ...val>
void f() {
[[assume(val)]]; // expected-error {{expression contains unexpanded parameter pack}}
}
+
+namespace gh71858 {
+int
+foo (int x, int y)
+{
+ __attribute__((assume(x == 42)));
+ __attribute__((assume(++y == 43))); // expected-warning {{has side effects that will be discarded}}
+ return x + y;
+}
+}
diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
index 07937deb6673..b70c02201ac3 100644
--- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
+++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp
@@ -446,3 +446,11 @@ int h(int x) {
}
#endif
+
+
+namespace GH91308 {
+ constexpr void f(auto) {
+ static_assert(false);
+ }
+ using R1 = decltype(&f<int>);
+}
diff --git a/clang/test/SemaCXX/eval-crashes.cpp b/clang/test/SemaCXX/eval-crashes.cpp
index 017df977b26b..a06f60f71e9c 100644
--- a/clang/test/SemaCXX/eval-crashes.cpp
+++ b/clang/test/SemaCXX/eval-crashes.cpp
@@ -25,11 +25,9 @@ namespace pr33140_0b {
}
namespace pr33140_2 {
- // FIXME: The declaration of 'b' below should lifetime-extend two int
- // temporaries.
- struct A { int &&r = 0; }; // expected-note 2{{initializing field 'r' with default member initializer}}
+ struct A { int &&r = 0; };
struct B { A x, y; };
- B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}}
+ B b = {};
}
namespace pr33140_3 {
diff --git a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
index be593eafe11d..45fdec606ad1 100644
--- a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
+++ b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp
@@ -75,7 +75,7 @@ void testOveraligned() {
// expected-error-re@-22 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
// expected-note@-23 {{if you supply your own aligned allocation functions}}
-// expected-error-re@-24 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}}
+// expected-error-re@-24 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is {{only|not}} available on}}
// expected-note@-25 {{if you supply your own aligned allocation functions}}
// expected-error-re@-26 {{aligned allocation function of type 'void *(std::size_t, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}}
@@ -143,19 +143,19 @@ OveralignedS2::~OveralignedS2() {}
// expected-no-diagnostics
#else
#if defined(IOS)
-// expected-error@-6 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on iOS 11 or newer}}}
+// expected-error@-6 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is only available on iOS 11 or newer}}}
// expected-note@-7 {{if you supply your own aligned allocation functions}}
#elif defined(TVOS)
-// expected-error@-9 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on tvOS 11 or newer}}}
+// expected-error@-9 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is only available on tvOS 11 or newer}}}
// expected-note@-10 {{if you supply your own aligned allocation functions}}
#elif defined(WATCHOS)
-// expected-error@-12 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on watchOS 4 or newer}}}
+// expected-error@-12 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is only available on watchOS 4 or newer}}}
// expected-note@-13 {{if you supply your own aligned allocation functions}}
#elif defined(MACOS)
-// expected-error@-15 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on macOS 10.13 or newer}}}
+// expected-error@-15 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is only available on macOS 10.13 or newer}}}
// expected-note@-16 {{if you supply your own aligned allocation functions}}
#elif defined(ZOS)
-// expected-error@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is not available on z/OS}}}
+// expected-error@-18 {{aligned deallocation function of type 'void (void *, std::size_t, std::align_val_t) noexcept' is not available on z/OS}}}
// expected-note@-19 {{if you supply your own aligned allocation functions}}
#endif
#endif
@@ -209,6 +209,9 @@ void *operator new(std::size_t __sz, std::align_val_t) {
void operator delete(void *p, std::align_val_t) {
}
+void operator delete(void *p, std::size_t __sz, std::align_val_t) {
+}
+
void testOveraligned2() {
auto p = new ((std::align_val_t)8) OveralignedS;
delete p;
diff --git a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
index 749d9e135d94..73cc946ca0ce 100644
--- a/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
+++ b/clang/test/SemaCXX/warn-thread-safety-analysis.cpp
@@ -5838,12 +5838,12 @@ class Foo5 {
class Foo6 {
- Mutex mu1 ACQUIRED_AFTER(mu3); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu1'}}
- Mutex mu2 ACQUIRED_AFTER(mu1); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu2'}}
- Mutex mu3 ACQUIRED_AFTER(mu2); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu3'}}
+ Mutex mu1 ACQUIRED_AFTER(mu3); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu1'}}
+ Mutex mu2 ACQUIRED_AFTER(mu1); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu2'}}
+ Mutex mu3 ACQUIRED_AFTER(mu2); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu3'}}
- Mutex mu_b ACQUIRED_BEFORE(mu_b); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu_b'}}
- Mutex mu_a ACQUIRED_AFTER(mu_a); // expected-warning {{Cycle in acquired_before/after dependencies, starting with 'mu_a'}}
+ Mutex mu_b ACQUIRED_BEFORE(mu_b); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu_b'}}
+ Mutex mu_a ACQUIRED_AFTER(mu_a); // expected-warning {{cycle in acquired_before/after dependencies, starting with 'mu_a'}}
void test0() {
mu_a.Lock();
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp
index 126257e0fc47..106661491800 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-pragma-misuse.cpp
@@ -18,8 +18,8 @@ void endUnopened(int *x) {
}
void wrongOption() {
-#pragma clang unsafe_buffer_usage start // expected-error{{Expected 'begin' or 'end'}}
-#pragma clang unsafe_buffer_usage close // expected-error{{Expected 'begin' or 'end'}}
+#pragma clang unsafe_buffer_usage start // expected-error{{expected 'begin' or 'end'}}
+#pragma clang unsafe_buffer_usage close // expected-error{{expected 'begin' or 'end'}}
}
void unclosed(int * p1) {
diff --git a/clang/test/SemaObjC/unguarded-availability.m b/clang/test/SemaObjC/unguarded-availability.m
index d0e23eabcb59..ecd91990174a 100644
--- a/clang/test/SemaObjC/unguarded-availability.m
+++ b/clang/test/SemaObjC/unguarded-availability.m
@@ -177,16 +177,28 @@ void justAtAvailable(void) {
#ifdef OBJCPP
-int f(char) AVAILABLE_10_12;
+int f(char) AVAILABLE_10_12; // #f_char_def
int f(int);
template <class T> int use_f() {
- // FIXME: We should warn here!
- return f(T());
+ if (@available(macos 10.12, *)) {
+ return f(T()); // no warning expected
+ } else {
+ // expected-warning@#f_call {{'f' is only available on macOS 10.12 or newer}}
+ // expected-note@#f_char_inst {{in instantiation of function template specialization 'use_f<char>' requested here}}
+ // expected-note@#f_char_def {{'f' has been marked as being introduced in macOS 10.12 here, but the deployment target is macOS 10.9}}
+ // expected-note@#f_call {{enclose 'f' in an @available check to silence this warning}}
+ return f(T()); // #f_call
+ }
}
int a = use_f<int>();
-int b = use_f<char>();
+int b = use_f<char>(); // #f_char_inst
+
+int use_f2() AVAILABLE_10_12 {
+ int c = use_f<int>();
+ int d = use_f<char>(); // no warning expected
+}
template <class> int use_at_available() {
if (@available(macos 10.12, *))
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl
new file mode 100644
index 000000000000..487cc53e8ad8
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -verify -o - %s
+// REQUIRES: amdgpu-registered-target
+
+typedef unsigned int u32;
+
+void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 size) {
+ __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{expression is not an integer constant expression}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+ __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}}
+}
diff --git a/clang/test/SemaOpenCL/vector_swizzle_length.cl b/clang/test/SemaOpenCL/vector_swizzle_length.cl
index f36ae201205e..b06cc126c3ec 100644
--- a/clang/test/SemaOpenCL/vector_swizzle_length.cl
+++ b/clang/test/SemaOpenCL/vector_swizzle_length.cl
@@ -5,6 +5,6 @@ typedef float float8 __attribute__((ext_vector_type(8)));
void foo(void) {
float8 f2 = (float8)(0, 0, 0, 0, 0, 0, 0, 0);
- f2.s01234; // expected-error {{vector component access has invalid length 5. Supported: 1,2,3,4,8,16}}
- f2.xyzxy; // expected-error {{vector component access has invalid length 5. Supported: 1,2,3,4,8,16}}
+ f2.s01234; // expected-error {{vector component access has invalid length 5; supported lengths are: 1,2,3,4,8,16}}
+ f2.xyzxy; // expected-error {{vector component access has invalid length 5; supported lengths are: 1,2,3,4,8,16}}
}
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index a91ab5ec7bcc..c38b647e42f4 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -100,11 +100,11 @@ using CT = C<int>;
// CHECK: | `-NonTypeTemplateParmDecl {{.*}} 'X' depth 1 index 1
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U
// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'type-parameter-0-2' depth 0 index 3 V
-// CHECK: | `-TemplateArgument expr
+// CHECK: | `-TemplateArgument {{.*}} expr
// CHECK: | `-IntegerLiteral {{.*}} 'int' 0
-// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<>, type-parameter-0-2) -> C<A>'
+// CHECK: |-CXXDeductionGuideDecl {{.*}} 'auto (A, Y<template-parameter-0-1>, type-parameter-0-2) -> C<A>'
// CHECK: | |-ParmVarDecl {{.*}} 'A'
-// CHECK: | |-ParmVarDecl {{.*}} 'Y<>'
+// CHECK: | |-ParmVarDecl {{.*}} 'Y<template-parameter-0-1>'
// CHECK: | `-ParmVarDecl {{.*}} 'type-parameter-0-2'
// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (int, Y<B>, int) -> C<int>'
// CHECK: |-TemplateArgument type 'int'
@@ -114,12 +114,12 @@ using CT = C<int>;
// CHECK: |-ParmVarDecl {{.*}} 'int'
// CHECK: |-ParmVarDecl {{.*}} 'Y<B>'
// CHECK: `-ParmVarDecl {{.*}} 'int'
-// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<>, type-parameter-0-2) -> C<A>' dependent trailing_return cdecl
+// CHECK: FunctionProtoType {{.*}} 'auto (A, Y<template-parameter-0-1>, type-parameter-0-2) -> C<A>' dependent trailing_return cdecl
// CHECK: |-InjectedClassNameType {{.*}} 'C<A>' dependent
// CHECK: |-TemplateTypeParmType {{.*}} 'A' dependent depth 0 index 0
// CHECK: | `-TemplateTypeParm {{.*}} 'A'
-// CHECK: |-ElaboratedType {{.*}} 'Y<>' sugar dependent
-// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<>' dependent Y
+// CHECK: |-ElaboratedType {{.*}} 'Y<template-parameter-0-1>' sugar dependent
+// CHECK: | `-TemplateSpecializationType {{.*}} 'Y<template-parameter-0-1>' dependent Y
// CHECK: | `-TemplateArgument template
// CHECK: `-TemplateTypeParmType {{.*}} 'type-parameter-0-2' dependent depth 0 index 2
@@ -139,7 +139,7 @@ using DT = D<int, int>;
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 ... T
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 1 U1
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 2 U2
-// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>'
+// CHECK: `-CXXDeductionGuideDecl {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>'
// CHECK: `-ParmVarDecl {{.*}} 'B<type-parameter-0-1, type-parameter-0-2> *'
// CHECK: FunctionProtoType {{.*}} 'auto (B<type-parameter-0-1, type-parameter-0-2> *) -> D<T...>' dependent trailing_return
// CHECK: |-InjectedClassNameType {{.*}} 'D<T...>' dependent
@@ -222,7 +222,7 @@ F s(0);
// CHECK-LABEL: Dumping <deduction guide for F>:
// CHECK: FunctionTemplateDecl
// CHECK: |-NonTypeTemplateParmDecl {{.*}} 'char' depth 0 index 0
-// CHECK: `-TemplateArgument expr
+// CHECK: `-TemplateArgument {{.*}} expr
// CHECK: | |-inherited from NonTypeTemplateParm {{.*}} '' 'char'
// CHECK: | `-CharacterLiteral {{.*}} 'char' 120
// CHECK: |-TemplateTypeParmDecl {{.*}} typename depth 0 index 1 U
diff --git a/clang/test/SemaTemplate/make_integer_seq.cpp b/clang/test/SemaTemplate/make_integer_seq.cpp
index 3a692f5ae2bf..c5a1e2705368 100644
--- a/clang/test/SemaTemplate/make_integer_seq.cpp
+++ b/clang/test/SemaTemplate/make_integer_seq.cpp
@@ -61,7 +61,7 @@ using test2 = B<int, 1>;
template <template <class T, T...> class S, class T, int N> struct C {
using test3 = __make_integer_seq<S, T, N>;
-// CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:63:3, col:43> col:9 test3 '__make_integer_seq<S, T, N>':'__make_integer_seq<type-parameter-0-1, N>'
+// CHECK: |-TypeAliasDecl 0x{{[0-9A-Fa-f]+}} <line:63:3, col:43> col:9 test3 '__make_integer_seq<S, T, N>':'__make_integer_seq<template-parameter-0-0, type-parameter-0-1, N>'
// CHECK-NEXT: `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<S, T, N>' sugar dependent
// CHECK-NEXT: `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<S, T, N>' sugar dependent alias __make_integer_seq
// CHECK-NEXT: |-TemplateArgument template S
@@ -71,7 +71,7 @@ template <template <class T, T...> class S, class T, int N> struct C {
// CHECK-NEXT: |-TemplateArgument expr
// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'T' <Dependent>
// CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} <col:42> 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int'
-// CHECK-NEXT: `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<type-parameter-0-1, N>' dependent __make_integer_seq
+// CHECK-NEXT: `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq<template-parameter-0-0, type-parameter-0-1, N>' dependent __make_integer_seq
// CHECK-NEXT: |-TemplateArgument template
// CHECK-NEXT: |-TemplateArgument type 'type-parameter-0-1'
// CHECK-NEXT: | `-TemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'type-parameter-0-1' dependent depth 0 index 1
diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt
index d3dec1984b78..4017b1445da0 100644
--- a/clang/tools/clang-repl/CMakeLists.txt
+++ b/clang/tools/clang-repl/CMakeLists.txt
@@ -11,6 +11,49 @@ add_clang_tool(clang-repl
ClangRepl.cpp
)
+if(MSVC)
+ set_target_properties(clang-repl PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS 1)
+
+ # RTTI/C++ symbols
+ set(clang_repl_exports ${clang_repl_exports} ??_7type_info@@6B@
+ ?__type_info_root_node@@3U__type_info_node@@A
+ ?nothrow@std@@3Unothrow_t@1@B
+ )
+
+ # Compiler added symbols for static variables. NOT for VStudio < 2015
+ set(clang_repl_exports ${clang_repl_exports} _Init_thread_abort _Init_thread_epoch
+ _Init_thread_footer _Init_thread_header _tls_index
+ )
+
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ # new/delete variants needed when linking to static msvc runtime (esp. Debug)
+ set(clang_repl_exports ${clang_repl_exports}
+ ??2@YAPEAX_K@Z
+ ??3@YAXPEAX@Z
+ ??_U@YAPEAX_K@Z
+ ??_V@YAXPEAX@Z
+ ??3@YAXPEAX_K@Z
+ )
+ else()
+ set(clang_repl_exports ${clang_repl_exports}
+ ??2@YAPAXI@Z
+ ??3@YAXPAX@Z
+ ??3@YAXPAXI@Z
+ ??_U@YAPAXI@Z
+ ??_V@YAXPAX@Z
+ ??_V@YAXPAXI@Z
+ )
+ endif()
+
+ # List to '/EXPORT:sym0 /EXPORT:sym1 /EXPORT:sym2 ...'
+ foreach(sym ${clang_repl_exports})
+ set(clang_repl_link_str "${clang_repl_link_str} /EXPORT:${sym}")
+ endforeach(sym ${clang_repl_exports})
+
+ set_property(TARGET clang-repl APPEND_STRING PROPERTY LINK_FLAGS ${clang_repl_link_str})
+
+endif(MSVC)
+
clang_target_link_libraries(clang-repl PRIVATE
clangAST
clangBasic
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index f42af7e330e1..036e57c8d213 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -86,6 +86,8 @@ static bool DeprecatedDriverCommand;
static ResourceDirRecipeKind ResourceDirRecipe;
static bool Verbose;
static bool PrintTiming;
+static llvm::BumpPtrAllocator Alloc;
+static llvm::StringSaver Saver{Alloc};
static std::vector<const char *> CommandLine;
#ifndef NDEBUG
@@ -99,8 +101,6 @@ static bool RoundTripArgs = DoRoundTripDefault;
static void ParseArgs(int argc, char **argv) {
ScanDepsOptTable Tbl;
llvm::StringRef ToolName = argv[0];
- llvm::BumpPtrAllocator Alloc;
- llvm::StringSaver Saver{Alloc};
llvm::opt::InputArgList Args =
Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) {
llvm::errs() << Msg << '\n';
@@ -792,6 +792,11 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
llvm::cl::PrintOptionValues();
+ // Expand response files in advance, so that we can "see" all the arguments
+ // when adjusting below.
+ Compilations = expandResponseFiles(std::move(Compilations),
+ llvm::vfs::getRealFileSystem());
+
// The command options are rewritten to run Clang in preprocessor only mode.
auto AdjustingCompilations =
std::make_unique<tooling::ArgumentsAdjustingCompilations>(
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index f00ba9e3acfc..49ed60d990ca 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -776,10 +776,9 @@ bool CursorVisitor::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
}
// Visit the default argument.
- if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
- if (TypeSourceInfo *DefArg = D->getDefaultArgumentInfo())
- if (Visit(DefArg->getTypeLoc()))
- return true;
+ if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited() &&
+ VisitTemplateArgumentLoc(D->getDefaultArgument()))
+ return true;
return false;
}
@@ -946,8 +945,9 @@ bool CursorVisitor::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) {
return true;
if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
- if (Expr *DefArg = D->getDefaultArgument())
- return Visit(MakeCXCursor(DefArg, StmtParent, TU, RegionOfInterest));
+ if (D->hasDefaultArgument() &&
+ VisitTemplateArgumentLoc(D->getDefaultArgument()))
+ return true;
return false;
}
diff --git a/clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt b/clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
index 95c6fdb610e0..cb6ebda18372 100644
--- a/clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
+++ b/clang/tools/scan-build-py/tests/functional/exec/CMakeLists.txt
@@ -2,11 +2,7 @@ project(exec C)
cmake_minimum_required(VERSION 3.20.0)
-include(CheckCCompilerFlag)
-check_c_compiler_flag("-std=c99" C99_SUPPORTED)
-if (C99_SUPPORTED)
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
-endif()
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
include(CheckFunctionExists)
include(CheckSymbolExists)
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 4ee64de697d3..3dc1c336365d 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -1188,7 +1188,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, TemplateTypeParmDeclDefaultArg) {
FromTU, templateTypeParmDecl(hasName("T")));
TemplateTypeParmDecl *To = Import(From, Lang_CXX03);
ASSERT_TRUE(To->hasDefaultArgument());
- QualType ToArg = To->getDefaultArgument();
+ QualType ToArg = To->getDefaultArgument().getArgument().getAsType();
ASSERT_EQ(ToArg, QualType(To->getASTContext().IntTy));
}
@@ -1260,7 +1260,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, NonTypeTemplateParmDeclDefaultArg) {
FromTU, nonTypeTemplateParmDecl(hasName("S")));
NonTypeTemplateParmDecl *To = Import(From, Lang_CXX03);
ASSERT_TRUE(To->hasDefaultArgument());
- Stmt *ToArg = To->getDefaultArgument();
+ Stmt *ToArg = To->getDefaultArgument().getArgument().getAsExpr();
ASSERT_TRUE(isa<IntegerLiteral>(ToArg));
ASSERT_EQ(cast<IntegerLiteral>(ToArg)->getValue().getLimitedValue(), 1U);
}
diff --git a/clang/unittests/AST/DeclTest.cpp b/clang/unittests/AST/DeclTest.cpp
index 2530ce74eb6a..16aa2b50b7a0 100644
--- a/clang/unittests/AST/DeclTest.cpp
+++ b/clang/unittests/AST/DeclTest.cpp
@@ -545,3 +545,34 @@ TEST(Decl, TemplateArgumentDefaulted) {
EXPECT_TRUE(ArgList.get(2).getIsDefaulted());
EXPECT_TRUE(ArgList.get(3).getIsDefaulted());
}
+
+TEST(Decl, CXXDestructorDeclsShouldHaveWellFormedNameInfoRanges) {
+ // GH71161
+ llvm::Annotations Code(R"cpp(
+template <typename T> struct Resource {
+ ~Resource(); // 1
+};
+template <typename T>
+Resource<T>::~Resource() {} // 2,3
+
+void instantiate_template() {
+ Resource<int> x;
+}
+)cpp");
+
+ auto AST = tooling::buildASTFromCode(Code.code());
+ ASTContext &Ctx = AST->getASTContext();
+
+ const auto &SM = Ctx.getSourceManager();
+ auto GetNameInfoRange = [&SM](const BoundNodes &Match) {
+ const auto *D = Match.getNodeAs<CXXDestructorDecl>("dtor");
+ return D->getNameInfo().getSourceRange().printToString(SM);
+ };
+
+ auto Matches = match(findAll(cxxDestructorDecl().bind("dtor")),
+ *Ctx.getTranslationUnitDecl(), Ctx);
+ ASSERT_EQ(Matches.size(), 3U);
+ EXPECT_EQ(GetNameInfoRange(Matches[0]), "<input.cc:3:3, col:4>");
+ EXPECT_EQ(GetNameInfoRange(Matches[1]), "<input.cc:6:14, col:15>");
+ EXPECT_EQ(GetNameInfoRange(Matches[2]), "<input.cc:6:14, col:15>");
+}
diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp
index 416723d498a2..41ab30bc81d5 100644
--- a/clang/unittests/Driver/DXCModeTest.cpp
+++ b/clang/unittests/Driver/DXCModeTest.cpp
@@ -156,9 +156,10 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
TranslatedArgs.reset(
TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None));
EXPECT_EQ(Diags.getNumErrors(), 1u);
- EXPECT_STREQ(DiagConsumer->Errors.back().c_str(),
- "invalid validator version : 0.1\nIf validator major version is "
- "0, minor version must also be 0.");
+ EXPECT_STREQ(
+ DiagConsumer->Errors.back().c_str(),
+ "invalid validator version : 0.1; if validator major version is 0, "
+ "minor version must also be 0");
Diags.Clear();
DiagConsumer->clear();
@@ -173,8 +174,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
TC.TranslateArgs(*DAL, "0", Action::OffloadKind::OFK_None));
EXPECT_EQ(Diags.getNumErrors(), 2u);
EXPECT_STREQ(DiagConsumer->Errors.back().c_str(),
- "invalid validator version : 1\nFormat of validator version is "
- "\"<major>.<minor>\" (ex:\"1.4\").");
+ "invalid validator version : 1; format of validator version is "
+ "\"<major>.<minor>\" (ex:\"1.4\")");
Diags.Clear();
DiagConsumer->clear();
@@ -190,8 +191,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
EXPECT_EQ(Diags.getNumErrors(), 3u);
EXPECT_STREQ(
DiagConsumer->Errors.back().c_str(),
- "invalid validator version : -Tlib_6_7\nFormat of validator version is "
- "\"<major>.<minor>\" (ex:\"1.4\").");
+ "invalid validator version : -Tlib_6_7; format of validator version is "
+ "\"<major>.<minor>\" (ex:\"1.4\")");
Diags.Clear();
DiagConsumer->clear();
@@ -207,8 +208,8 @@ TEST(DxcModeTest, ValidatorVersionValidation) {
EXPECT_EQ(Diags.getNumErrors(), 4u);
EXPECT_STREQ(
DiagConsumer->Errors.back().c_str(),
- "invalid validator version : foo\nFormat of validator version is "
- "\"<major>.<minor>\" (ex:\"1.4\").");
+ "invalid validator version : foo; format of validator version is "
+ "\"<major>.<minor>\" (ex:\"1.4\")");
Diags.Clear();
DiagConsumer->clear();
}
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 2f0c0f026677..a9df994189f0 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -17340,12 +17340,14 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeAssignmentOperators) {
verifyFormat("int a = 5;");
verifyFormat("a += 42;");
verifyFormat("a or_eq 8;");
+ verifyFormat("xor = foo;");
FormatStyle Spaces = getLLVMStyle();
Spaces.SpaceBeforeAssignmentOperators = false;
verifyFormat("int a= 5;", Spaces);
verifyFormat("a+= 42;", Spaces);
verifyFormat("a or_eq 8;", Spaces);
+ verifyFormat("xor= foo;", Spaces);
}
TEST_F(FormatTest, ConfigurableSpaceBeforeColon) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 45c1554308c9..6ea9c4a241dc 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3015,6 +3015,60 @@ TEST_F(TokenAnnotatorTest, SwitchExpression) {
EXPECT_TOKEN(Tokens[20], tok::arrow, TT_CaseLabelArrow);
}
+TEST_F(TokenAnnotatorTest, CppAltOperatorKeywords) {
+ auto Tokens = annotate("a = b and c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_BinaryOperator);
+
+ Tokens = annotate("a = b and_eq c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::ampequal, TT_BinaryOperator);
+
+ Tokens = annotate("a = b bitand c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::amp, TT_BinaryOperator);
+
+ Tokens = annotate("a = b bitor c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::pipe, TT_BinaryOperator);
+
+ Tokens = annotate("a = b compl c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::tilde, TT_UnaryOperator);
+
+ Tokens = annotate("a = b not c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::exclaim, TT_UnaryOperator);
+
+ Tokens = annotate("a = b not_eq c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::exclaimequal, TT_BinaryOperator);
+
+ Tokens = annotate("a = b or c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::pipepipe, TT_BinaryOperator);
+
+ Tokens = annotate("a = b or_eq c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::pipeequal, TT_BinaryOperator);
+
+ Tokens = annotate("a = b xor c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::caret, TT_BinaryOperator);
+
+ Tokens = annotate("a = b xor_eq c;");
+ ASSERT_EQ(Tokens.size(), 7u);
+ EXPECT_TOKEN(Tokens[3], tok::caretequal, TT_BinaryOperator);
+
+ Tokens = annotate("xor = foo;");
+ ASSERT_EQ(Tokens.size(), 5u);
+ EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+
+ Tokens = annotate("int xor = foo;");
+ ASSERT_EQ(Tokens.size(), 6u);
+ EXPECT_TOKEN(Tokens[1], tok::identifier, TT_StartOfName);
+}
+
} // namespace
} // namespace format
} // namespace clang
diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt
index e5a77e77de75..c0fd2d8f3777 100644
--- a/clang/unittests/Interpreter/CMakeLists.txt
+++ b/clang/unittests/Interpreter/CMakeLists.txt
@@ -29,3 +29,46 @@ if(NOT WIN32)
endif()
export_executable_symbols(ClangReplInterpreterTests)
+
+if(MSVC)
+ set_target_properties(ClangReplInterpreterTests PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS 1)
+
+ # RTTI/C++ symbols
+ set(ClangReplInterpreterTests_exports ${ClangReplInterpreterTests_exports} ??_7type_info@@6B@
+ ?__type_info_root_node@@3U__type_info_node@@A
+ ?nothrow@std@@3Unothrow_t@1@B
+ )
+
+ # Compiler added symbols for static variables. NOT for VStudio < 2015
+ set(ClangReplInterpreterTests_exports ${ClangReplInterpreterTests_exports} _Init_thread_abort _Init_thread_epoch
+ _Init_thread_footer _Init_thread_header _tls_index
+ )
+
+ if(CMAKE_SIZEOF_VOID_P EQUAL 8)
+ # new/delete variants needed when linking to static msvc runtime (esp. Debug)
+ set(ClangReplInterpreterTests_exports ${ClangReplInterpreterTests_exports}
+ ??2@YAPEAX_K@Z
+ ??3@YAXPEAX@Z
+ ??_U@YAPEAX_K@Z
+ ??_V@YAXPEAX@Z
+ ??3@YAXPEAX_K@Z
+ )
+ else()
+ set(ClangReplInterpreterTests_exports ${ClangReplInterpreterTests_exports}
+ ??2@YAPAXI@Z
+ ??3@YAXPAX@Z
+ ??3@YAXPAXI@Z
+ ??_U@YAPAXI@Z
+ ??_V@YAXPAX@Z
+ ??_V@YAXPAXI@Z
+ )
+ endif()
+
+ # List to '/EXPORT:sym0 /EXPORT:sym1 /EXPORT:sym2 ...'
+ foreach(sym ${ClangReplInterpreterTests_exports})
+ set(ClangReplInterpreterTests_link_str "${ClangReplInterpreterTests_link_str} /EXPORT:${sym}")
+ endforeach(sym ${ClangReplInterpreterTests_exports})
+
+ set_property(TARGET ClangReplInterpreterTests APPEND_STRING PROPERTY LINK_FLAGS ${ClangReplInterpreterTests_link_str})
+
+endif(MSVC)
diff --git a/clang/unittests/StaticAnalyzer/CallEventTest.cpp b/clang/unittests/StaticAnalyzer/CallEventTest.cpp
index adbfe02a284d..7c4132788ca7 100644
--- a/clang/unittests/StaticAnalyzer/CallEventTest.cpp
+++ b/clang/unittests/StaticAnalyzer/CallEventTest.cpp
@@ -76,7 +76,7 @@ TEST(CXXDeallocatorCall, SimpleDestructor) {
}
)",
Diags));
- EXPECT_EQ(Diags, "test.CXXDeallocator: NumArgs: 1\n");
+ EXPECT_EQ(Diags, "test.CXXDeallocator: NumArgs: 2\n");
}
} // namespace
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 3ddfd3277b68..e77d80623e84 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1385,17 +1385,14 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
SVEType ToV(To.BaseType, N);
for (const ReinterpretTypeInfo &From : Reinterprets) {
SVEType FromV(From.BaseType, N);
- if (ShortForm) {
- OS << "__aio __attribute__((target(\"sve\"))) " << ToV.str()
- << " svreinterpret_" << To.Suffix;
- OS << "(" << FromV.str() << " op) __arm_streaming_compatible {\n";
- OS << " return __builtin_sve_reinterpret_" << To.Suffix << "_"
- << From.Suffix << Suffix << "(op);\n";
- OS << "}\n\n";
- } else
- OS << "#define svreinterpret_" << To.Suffix << "_" << From.Suffix
- << Suffix << "(...) __builtin_sve_reinterpret_" << To.Suffix
- << "_" << From.Suffix << Suffix << "(__VA_ARGS__)\n";
+ OS << "__aio "
+ "__attribute__((__clang_arm_builtin_alias(__builtin_sve_"
+ "reinterpret_"
+ << To.Suffix << "_" << From.Suffix << Suffix << ")))\n"
+ << ToV.str() << " svreinterpret_" << To.Suffix;
+ if (!ShortForm)
+ OS << "_" << From.Suffix << Suffix;
+ OS << "(" << FromV.str() << " op);\n";
}
}
}
@@ -1453,7 +1450,7 @@ void SVEEmitter::createBuiltins(raw_ostream &OS) {
SVEType FromV(From.BaseType, N);
OS << "TARGET_BUILTIN(__builtin_sve_reinterpret_" << To.Suffix << "_"
<< From.Suffix << Suffix << +", \"" << ToV.builtin_str()
- << FromV.builtin_str() << "\", \"n\", \"sve\")\n";
+ << FromV.builtin_str() << "\", \"n\", \"sme|sve\")\n";
}
}
}
diff --git a/clang/utils/analyzer/entrypoint.py b/clang/utils/analyzer/entrypoint.py
index ff877060bad6..4deb42db0a0b 100644
--- a/clang/utils/analyzer/entrypoint.py
+++ b/clang/utils/analyzer/entrypoint.py
@@ -54,7 +54,7 @@ CMAKE_COMMAND = (
"cmake -G Ninja -DCMAKE_BUILD_TYPE=Release "
"-DCMAKE_INSTALL_PREFIX=/analyzer -DLLVM_TARGETS_TO_BUILD=X86 "
'-DLLVM_ENABLE_PROJECTS="clang;openmp" -DLLVM_BUILD_RUNTIME=OFF '
- "-DLLVM_ENABLE_TERMINFO=OFF -DCLANG_ENABLE_ARCMT=OFF "
+ "-DCLANG_ENABLE_ARCMT=OFF "
"-DCLANG_ENABLE_STATIC_ANALYZER=ON"
)
diff --git a/clang/utils/ci/buildkite-pipeline.yml b/clang/utils/ci/buildkite-pipeline.yml
index 7a679176038c..86cfcf35cc86 100644
--- a/clang/utils/ci/buildkite-pipeline.yml
+++ b/clang/utils/ci/buildkite-pipeline.yml
@@ -17,18 +17,7 @@ env:
# LLVM RELEASE bump version
LLVM_HEAD_VERSION: "17"
steps:
- - label: "Format"
- commands:
- - "clang/utils/ci/run-buildbot check-format"
- agents:
- queue: "linux"
- retry:
- automatic:
- - exit_status: -1 # Agent was lost
- limit: 2
- timeout_in_minutes: 120
-
- - label: "Building and testing clang (Linux)"
+ - label: "Building Clang (Linux)"
commands:
- "clang/utils/ci/run-buildbot build-clang"
agents:
@@ -39,21 +28,9 @@ steps:
limit: 2
timeout_in_minutes: 120
- - label: "Building and testing clang (Windows)"
- commands:
- - "C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64"
- - "bash clang/utils/ci/run-buildbot build-clang-windows"
- agents:
- queue: "windows"
- retry:
- automatic:
- - exit_status: -1 # Agent was lost
- limit: 2
- timeout_in_minutes: 120
-
- wait
- - label: "Running libc++ test suite in C++03"
+ - label: "Testing libc++ with just-built Clang (C++03)"
commands:
- "clang/utils/ci/run-buildbot generic-cxx03"
artifact_paths:
@@ -70,7 +47,7 @@ steps:
limit: 2
timeout_in_minutes: 120
- - label: "Running libc++ test suite in C++26"
+ - label: "Testing libc++ with just-built Clang (C++26)"
commands:
- "clang/utils/ci/run-buildbot generic-cxx26"
artifact_paths:
@@ -87,7 +64,7 @@ steps:
limit: 2
timeout_in_minutes: 120
- - label: "Running libc++ test suite with Clang Modules"
+ - label: "Testing libc++ with just-built Clang (w/ Clang Modules)"
commands:
- "clang/utils/ci/run-buildbot generic-modules"
artifact_paths:
diff --git a/clang/utils/ci/run-buildbot b/clang/utils/ci/run-buildbot
index f47ffb5cbd38..c68ddad571f3 100755
--- a/clang/utils/ci/run-buildbot
+++ b/clang/utils/ci/run-buildbot
@@ -69,13 +69,6 @@ cmake --version
ninja --version
case "${BUILDER}" in
-check-format)
- echo "*** Checking for trailing whitespace left in Clang source files ***"
- if grep -rnI '[[:blank:]]$' clang/lib clang/include clang/docs; then
- echo "*** Trailing whitespace has been found in Clang source files as described above ***"
- exit 1
- fi
-;;
build-clang)
mkdir install
# We use Release here to avoid including debug information. Otherwise, the
@@ -90,29 +83,13 @@ build-clang)
-DCMAKE_CXX_COMPILER_LAUNCHER="ccache" \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=install \
+ -DLLVM_TARGETS_TO_BUILD=Native \
-DLLVM_ENABLE_PROJECTS="clang;compiler-rt" \
ninja -C ${BUILD_DIR} install-clang install-clang-resource-headers
ccache -s
tar -cJvf install.tar.xz install/
buildkite-agent artifact upload --debug install.tar.xz
-
- ninja -C ${BUILD_DIR} check-clang
-;;
-build-clang-windows)
- cmake -S llvm -B ${BUILD_DIR} -G Ninja \
- -D CMAKE_C_COMPILER_LAUNCHER=sccache \
- -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \
- -D CMAKE_BUILD_TYPE=Release \
- -D CMAKE_INSTALL_PREFIX=install-windows \
- -D LLVM_ENABLE_PROJECTS="clang;compiler-rt" \
- -D LLVM_ENABLE_ASSERTIONS=ON \
- -D LLVM_BUILD_EXAMPLES=ON \
- -D COMPILER_RT_BUILD_LIBFUZZER=OFF \
- -D COMPILER_RT_BUILD_ORC=OFF
-
- ninja -C ${BUILD_DIR} install-clang install-clang-resource-headers
- ninja -C ${BUILD_DIR} check-clang
;;
generic-cxx03)
buildkite-agent artifact download install.tar.xz .
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 9d458330f537..4cce88fe0490 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -10698,7 +10698,7 @@ and <I>POD class</I></td>
<td><a href="https://cplusplus.github.io/CWG/issues/1815.html">1815</a></td>
<td>CD4</td>
<td>Lifetime extension in aggregate initialization</td>
- <td class="none" align="center">No</td>
+ <td class="unreleased" align="center">Clang 19</td>
</tr>
<tr id="1816">
<td><a href="https://cplusplus.github.io/CWG/issues/1816.html">1816</a></td>
@@ -17095,7 +17095,7 @@ objects</td>
<td><a href="https://cplusplus.github.io/CWG/issues/2881.html">2881</a></td>
<td>tentatively ready</td>
<td>Type restrictions for the explicit object parameter of a lambda</td>
- <td align="center">Not resolved</td>
+ <td title="Clang 19 implements 2024-04-19 resolution" align="center">Not Resolved*</td>
</tr>
<tr class="open" id="2882">
<td><a href="https://cplusplus.github.io/CWG/issues/2882.html">2882</a></td>
diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html
index a11bf9a06f9f..45416170b16e 100755
--- a/clang/www/cxx_status.html
+++ b/clang/www/cxx_status.html
@@ -1255,12 +1255,11 @@ version.
</table>
<p>
-<span id="n3778">(7): In Clang 3.7 and later, sized deallocation is only enabled
-if the user passes the <code>-fsized-deallocation</code> flag. The user must
-supply definitions of the sized deallocation functions, either by providing them
-explicitly or by using a C++ standard library that does. <code>libstdc++</code>
-added these functions in version 5.0, and <code>libc++</code> added them in
-version 3.7.
+<span id="n3778">(7): The user must supply definitions of the sized deallocation
+ functions, either by providing them explicitly or by using a C++ standard library
+ that does. <code>libstdc++</code> added these functions in version 5.0, and
+ <code>libc++</code> added them in version 3.7. The user can also use the
+ <code>-fno-sized-deallocation</code> option to disable sized deallocation.
</span>
</p>
</details>
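
The revised note above says the user may supply the sized deallocation functions explicitly. As a minimal sketch of what that can look like (not part of this patch; the replacement `operator new` is included only so the allocation and deallocation pair up consistently):

```cpp
#include <cstdlib>
#include <new>

// Replacement allocation function, so the frees below match the allocations.
void *operator new(std::size_t n) {
  if (void *p = std::malloc(n ? n : 1))
    return p;
  throw std::bad_alloc{};
}
// Unsized and sized global deallocation functions supplied explicitly.
void operator delete(void *p) noexcept { std::free(p); }
void operator delete(void *p, std::size_t) noexcept { std::free(p); }

int main() {
  int *q = new int(42);
  int v = *q;
  delete q; // under sized deallocation, this may dispatch to the sized overload
  return v == 42 ? 0 : 1;
}
```

With the behavior described in the note, plain `delete` expressions may call the sized overload, so both forms should be defined when replacing the global deallocation functions.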
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 42edbe15edaf..bddaa37579fd 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -182,21 +182,6 @@ check_library_exists(m pow "" COMPILER_RT_HAS_LIBM)
check_library_exists(pthread pthread_create "" COMPILER_RT_HAS_LIBPTHREAD)
check_library_exists(execinfo backtrace "" COMPILER_RT_HAS_LIBEXECINFO)
-# Look for terminfo library, used in unittests that depend on LLVMSupport.
-if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON)
- set(MAYBE_REQUIRED REQUIRED)
-else()
- set(MAYBE_REQUIRED)
-endif()
-if(LLVM_ENABLE_TERMINFO)
- find_library(COMPILER_RT_TERMINFO_LIB NAMES terminfo tinfo curses ncurses ncursesw ${MAYBE_REQUIRED})
-endif()
-if(COMPILER_RT_TERMINFO_LIB)
- set(LLVM_ENABLE_TERMINFO 1)
-else()
- set(LLVM_ENABLE_TERMINFO 0)
-endif()
-
if (ANDROID AND COMPILER_RT_HAS_LIBDL)
# Android's libstdc++ has a dependency on libdl.
list(APPEND CMAKE_REQUIRED_LIBRARIES dl)
diff --git a/compiler-rt/lib/dfsan/dfsan_allocator.cpp b/compiler-rt/lib/dfsan/dfsan_allocator.cpp
index 63475f434cd1..682df8c6e034 100644
--- a/compiler-rt/lib/dfsan/dfsan_allocator.cpp
+++ b/compiler-rt/lib/dfsan/dfsan_allocator.cpp
@@ -45,7 +45,7 @@ const uptr kAllocatorSpace = 0xE00000000000ULL;
#else
const uptr kAllocatorSpace = 0x700000000000ULL;
#endif
-const uptr kMaxAllowedMallocSize = 8UL << 30;
+const uptr kMaxAllowedMallocSize = 1ULL << 40;
struct AP64 { // Allocator64 parameters. Deliberately using a short name.
static const uptr kSpaceBeg = kAllocatorSpace;
diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp
index 3af26e9f64c9..af3c1f4d1673 100644
--- a/compiler-rt/lib/dfsan/dfsan_custom.cpp
+++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp
@@ -1901,17 +1901,27 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfso_nanosleep(
return __dfsw_nanosleep(req, rem, req_label, rem_label, ret_label);
}
-static void clear_msghdr_labels(size_t bytes_written, struct msghdr *msg) {
+static void clear_msghdr_labels(size_t bytes_written, struct msghdr *msg,
+ int flags) {
dfsan_set_label(0, msg, sizeof(*msg));
dfsan_set_label(0, msg->msg_name, msg->msg_namelen);
dfsan_set_label(0, msg->msg_control, msg->msg_controllen);
- for (size_t i = 0; bytes_written > 0; ++i) {
- assert(i < msg->msg_iovlen);
+ for (size_t i = 0; i < msg->msg_iovlen; ++i) {
struct iovec *iov = &msg->msg_iov[i];
- size_t iov_written =
- bytes_written < iov->iov_len ? bytes_written : iov->iov_len;
+ size_t iov_written = iov->iov_len;
+
+ // When MSG_TRUNC is not set, we want to avoid setting 0 label on bytes that
+ // may not have changed, using bytes_written to bound the 0 label write.
+ // When MSG_TRUNC flag is set, bytes_written may be larger than the buffer,
+ // and should not be used as a bound.
+ if (!(MSG_TRUNC & flags)) {
+ if (bytes_written < iov->iov_len) {
+ iov_written = bytes_written;
+ }
+ bytes_written -= iov_written;
+ }
+
dfsan_set_label(0, iov->iov_base, iov_written);
- bytes_written -= iov_written;
}
}
@@ -1923,7 +1933,7 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_recvmmsg(
int ret = recvmmsg(sockfd, msgvec, vlen, flags, timeout);
for (int i = 0; i < ret; ++i) {
dfsan_set_label(0, &msgvec[i].msg_len, sizeof(msgvec[i].msg_len));
- clear_msghdr_labels(msgvec[i].msg_len, &msgvec[i].msg_hdr);
+ clear_msghdr_labels(msgvec[i].msg_len, &msgvec[i].msg_hdr, flags);
}
*ret_label = 0;
return ret;
@@ -1947,7 +1957,7 @@ SANITIZER_INTERFACE_ATTRIBUTE ssize_t __dfsw_recvmsg(
dfsan_label msg_label, dfsan_label flags_label, dfsan_label *ret_label) {
ssize_t ret = recvmsg(sockfd, msg, flags);
if (ret >= 0)
- clear_msghdr_labels(ret, msg);
+ clear_msghdr_labels(ret, msg, flags);
*ret_label = 0;
return ret;
}
diff --git a/compiler-rt/lib/lsan/lsan_allocator.cpp b/compiler-rt/lib/lsan/lsan_allocator.cpp
index 12d579a9385b..493bf5f9efc5 100644
--- a/compiler-rt/lib/lsan/lsan_allocator.cpp
+++ b/compiler-rt/lib/lsan/lsan_allocator.cpp
@@ -31,7 +31,7 @@ static const uptr kMaxAllowedMallocSize = 1ULL << 30;
#elif defined(__mips64) || defined(__aarch64__)
static const uptr kMaxAllowedMallocSize = 4ULL << 30;
#else
-static const uptr kMaxAllowedMallocSize = 8ULL << 30;
+static const uptr kMaxAllowedMallocSize = 1ULL << 40;
#endif
static Allocator allocator;
diff --git a/compiler-rt/lib/msan/msan_allocator.cpp b/compiler-rt/lib/msan/msan_allocator.cpp
index b1bc5b9390f7..8350106dc817 100644
--- a/compiler-rt/lib/msan/msan_allocator.cpp
+++ b/compiler-rt/lib/msan/msan_allocator.cpp
@@ -71,7 +71,7 @@ static const uptr kAllocatorSpace = 0x700000000000ULL;
#else
static const uptr kAllocatorSpace = 0x600000000000ULL;
#endif
-static const uptr kMaxAllowedMallocSize = 8UL << 30;
+static const uptr kMaxAllowedMallocSize = 1ULL << 40;
struct AP64 { // Allocator64 parameters. Deliberately using a short name.
static const uptr kSpaceBeg = kAllocatorSpace;
diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
index 005bd6d584c5..b4702339db59 100755
--- a/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
+++ b/compiler-rt/lib/sanitizer_common/symbolizer/scripts/build_symbolizer.sh
@@ -139,7 +139,6 @@ if [[ ! -f ${LLVM_BUILD}/build.ninja ]]; then
-DLLVM_INCLUDE_TESTS=OFF \
-DLLVM_ENABLE_ZLIB=ON \
-DLLVM_ENABLE_ZSTD=OFF \
- -DLLVM_ENABLE_TERMINFO=OFF \
-DLLVM_ENABLE_THREADS=OFF \
$LLVM_SRC
fi
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 15a199ae0349..f9ed36581f8d 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -1052,6 +1052,10 @@ private:
void *Block, const uptr UserPtr,
const uptr SizeOrUnusedBytes,
const FillContentsMode FillContents) {
+ // Compute the default pointer before adding the header tag
+ const uptr DefaultAlignedPtr =
+ reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
+
Block = addHeaderTag(Block);
// Only do content fill when it's from primary allocator because secondary
// allocator has filled the content.
@@ -1064,8 +1068,6 @@ private:
Chunk::UnpackedHeader Header = {};
- const uptr DefaultAlignedPtr =
- reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
if (UNLIKELY(DefaultAlignedPtr != UserPtr)) {
const uptr Offset = UserPtr - DefaultAlignedPtr;
DCHECK_GE(Offset, 2 * sizeof(u32));
@@ -1096,6 +1098,10 @@ private:
const Options Options = Primary.Options.load();
DCHECK(useMemoryTagging<AllocatorConfig>(Options));
+ // Compute the default pointer before adding the header tag
+ const uptr DefaultAlignedPtr =
+ reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
+
void *Ptr = reinterpret_cast<void *>(UserPtr);
void *TaggedPtr = Ptr;
@@ -1194,8 +1200,6 @@ private:
Chunk::UnpackedHeader Header = {};
- const uptr DefaultAlignedPtr =
- reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
if (UNLIKELY(DefaultAlignedPtr != UserPtr)) {
const uptr Offset = UserPtr - DefaultAlignedPtr;
DCHECK_GE(Offset, 2 * sizeof(u32));
diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt
index 0a428b9a30b1..4c7e92b6ecc3 100644
--- a/compiler-rt/lib/xray/tests/CMakeLists.txt
+++ b/compiler-rt/lib/xray/tests/CMakeLists.txt
@@ -54,11 +54,6 @@ set(XRAY_UNITTEST_LINK_FLAGS
${COMPILER_RT_CXX_LINK_LIBS})
if (NOT APPLE)
- # Needed by LLVMSupport.
- append_list_if(
- LLVM_ENABLE_TERMINFO
- -l${COMPILER_RT_TERMINFO_LIB} XRAY_UNITTEST_LINK_FLAGS)
-
# We add the library directories one at a time in our CFLAGS.
foreach (DIR ${LLVM_LIBRARY_DIR})
list(APPEND XRAY_UNITTEST_LINK_FLAGS -L${DIR})
diff --git a/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
index 12ed505883e2..ac3649a9e1bf 100644
--- a/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/bitfield_uaf.cpp
@@ -24,10 +24,10 @@ int main(void) {
// CHECK: [[ADDR]] is located 0 bytes inside of 4-byte region
// CHECK-LABEL: freed by thread T0 here:
// CHECK: {{#0 .* free }}
- // CHECK: {{ #[1-2] .* main .*bitfield_uaf.cpp}}:[[@LINE-4]]
+ // CHECK: {{ #[1-3] .* main .*bitfield_uaf.cpp}}:[[@LINE-4]]
// CHECK-LABEL: previously allocated by thread T0 here:
// CHECK: {{#0 .* malloc }}
- // CHECK: {{ #[1-2] .* main .*bitfield_uaf.cpp}}:[[@LINE-8]]
+ // CHECK: {{ #[1-3] .* main .*bitfield_uaf.cpp}}:[[@LINE-8]]
make_access(s);
return 0;
}
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
index e96fb6190f5a..e71ffdb9f241 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_left_oob.cpp
@@ -12,6 +12,6 @@ int main() {
// CHECK: [[ADDR]] is located 4 bytes before 168-byte region
// CHECK: allocated by thread T0 here:
// CHECK: {{#0 .* calloc }}
- // CHECK: {{ #[1-2] .* main .*calloc_left_oob.cpp}}:[[@LINE-8]]
+ // CHECK: {{ #[1-3] .* main .*calloc_left_oob.cpp}}:[[@LINE-8]]
free(buffer);
}
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
index fe0fc20e1919..507d84483cca 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_right_oob.cpp
@@ -12,6 +12,6 @@ int main() {
// CHECK: [[ADDR]] is located 0 bytes after 168-byte region
// CHECK: allocated by thread T0 here:
// CHECK-NEXT: {{#0 .* calloc }}
- // CHECK: {{ #[1-2] .* main .*calloc_right_oob.cpp}}:[[@LINE-8]]
+ // CHECK: {{ #[1-3] .* main .*calloc_right_oob.cpp}}:[[@LINE-8]]
free(buffer);
}
diff --git a/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp b/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
index bf13f7d3eb66..a03c5e10a533 100644
--- a/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
+++ b/compiler-rt/test/asan/TestCases/Windows/calloc_uaf.cpp
@@ -13,8 +13,8 @@ int main() {
// CHECK: [[ADDR]] is located 0 bytes inside of 168-byte region
// CHECK: freed by thread T0 here:
// CHECK-NEXT: {{#0 .* free }}
- // CHECK: {{ #[1-2] .* main .*calloc_uaf.cpp}}:[[@LINE-8]]
+ // CHECK: {{ #[1-3] .* main .*calloc_uaf.cpp}}:[[@LINE-8]]
// CHECK: previously allocated by thread T0 here:
// CHECK-NEXT: {{#0 .* calloc }}
- // CHECK: {{ #[1-2] .* main .*calloc_uaf.cpp}}:[[@LINE-12]]
+ // CHECK: {{ #[1-3] .* main .*calloc_uaf.cpp}}:[[@LINE-12]]
}
diff --git a/compiler-rt/test/dfsan/custom.cpp b/compiler-rt/test/dfsan/custom.cpp
index f544e481b726..cede0d64dbcf 100644
--- a/compiler-rt/test/dfsan/custom.cpp
+++ b/compiler-rt/test/dfsan/custom.cpp
@@ -768,26 +768,53 @@ void test_recvmsg() {
ssize_t sent = sendmsg(sockfds[0], &smsg, 0);
assert(sent > 0);
- char rbuf[128];
- struct iovec riovs[2] = {{&rbuf[0], 4}, {&rbuf[4], 4}};
- struct msghdr rmsg = {};
- rmsg.msg_iov = riovs;
- rmsg.msg_iovlen = 2;
-
- dfsan_set_label(i_label, rbuf, sizeof(rbuf));
- dfsan_set_label(i_label, &rmsg, sizeof(rmsg));
-
- DEFINE_AND_SAVE_ORIGINS(rmsg)
-
- ssize_t received = recvmsg(sockfds[1], &rmsg, 0);
- assert(received == sent);
- assert(memcmp(sbuf, rbuf, 8) == 0);
- ASSERT_ZERO_LABEL(received);
- ASSERT_READ_ZERO_LABEL(&rmsg, sizeof(rmsg));
- ASSERT_READ_ZERO_LABEL(&rbuf[0], 8);
- ASSERT_READ_LABEL(&rbuf[8], 1, i_label);
-
- ASSERT_SAVED_ORIGINS(rmsg)
+ {
+ char rpbuf[2];
+ struct iovec peek_iov;
+ peek_iov.iov_base = rpbuf;
+ peek_iov.iov_len = 2;
+
+ struct msghdr peek_header = {};
+ peek_header.msg_iov = &peek_iov;
+ peek_header.msg_iovlen = 1;
+
+ dfsan_set_label(i_label, rpbuf, sizeof(rpbuf));
+ dfsan_set_label(i_label, &peek_header, sizeof(peek_header));
+
+ DEFINE_AND_SAVE_ORIGINS(peek_header)
+
+ ssize_t received = recvmsg(sockfds[1], &peek_header, MSG_PEEK | MSG_TRUNC);
+ assert(received == sent);
+ assert(memcmp(sbuf, rpbuf, 2) == 0);
+ ASSERT_ZERO_LABEL(received);
+ ASSERT_READ_ZERO_LABEL(&peek_header, sizeof(peek_header));
+ ASSERT_READ_ZERO_LABEL(&rpbuf[0], 0);
+
+ ASSERT_SAVED_ORIGINS(peek_header)
+ }
+
+ {
+ char rbuf[128];
+ struct iovec riovs[2] = {{&rbuf[0], 4}, {&rbuf[4], 4}};
+ struct msghdr rmsg = {};
+ rmsg.msg_iov = riovs;
+ rmsg.msg_iovlen = 2;
+
+ dfsan_set_label(i_label, rbuf, sizeof(rbuf));
+ dfsan_set_label(i_label, &rmsg, sizeof(rmsg));
+
+ DEFINE_AND_SAVE_ORIGINS(rmsg)
+
+ ssize_t received = recvmsg(sockfds[1], &rmsg, 0);
+ assert(received == sent);
+ assert(memcmp(sbuf, rbuf, 8) == 0);
+ ASSERT_ZERO_LABEL(received);
+ ASSERT_READ_ZERO_LABEL(&rmsg, sizeof(rmsg));
+ ASSERT_READ_ZERO_LABEL(&rbuf[0], 8);
+ ASSERT_READ_LABEL(&rbuf[8], 1, i_label);
+
+ ASSERT_SAVED_ORIGINS(rmsg)
+ }
close(sockfds[0]);
close(sockfds[1]);
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index c8e75024823f..af34366b3652 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -336,7 +336,7 @@ endif()
if (FLANG_RUNTIME_F128_MATH_LIB)
add_compile_definitions(
- -DFLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}"
+ FLANG_RUNTIME_F128_MATH_LIB="${FLANG_RUNTIME_F128_MATH_LIB}"
)
endif()
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 43ed35e36a6e..7b872c786c82 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -223,6 +223,10 @@ end
* When a dummy argument is `POINTER` or `ALLOCATABLE` and is `INTENT(IN)`, we
relax enforcement of some requirements on actual arguments that must otherwise
hold true for definable arguments.
+* We allow a limited polymorphic `POINTER` or `ALLOCATABLE` actual argument
+ to be associated with a compatible monomorphic dummy argument, as
+ our implementation, like others, supports a reallocation that would
+ change the dynamic type.
* Assignment of `LOGICAL` to `INTEGER` and vice versa (but not other types) is
allowed. The values are normalized to canonical `.TRUE.`/`.FALSE.`.
The values are also normalized for assignments of `LOGICAL(KIND=K1)` to
diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h
index f57fcdc895ad..15c4af63f4be 100644
--- a/flang/include/flang/Common/Fortran-features.h
+++ b/flang/include/flang/Common/Fortran-features.h
@@ -49,7 +49,8 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
IndistinguishableSpecifics, SubroutineAndFunctionSpecifics,
EmptySequenceType, NonSequenceCrayPointee, BranchIntoConstruct,
BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize,
- NonBindCInteroperability, CudaManaged, CudaUnified)
+ NonBindCInteroperability, CudaManaged, CudaUnified,
+ PolymorphicActualAllocatableOrPointerToMonomorphicDummy)
// Portability and suspicious usage warnings
ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
diff --git a/flang/include/flang/Common/api-attrs.h b/flang/include/flang/Common/api-attrs.h
index 04ee307326ac..d73e60996bc8 100644
--- a/flang/include/flang/Common/api-attrs.h
+++ b/flang/include/flang/Common/api-attrs.h
@@ -156,4 +156,26 @@
#define RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
#endif /* !defined(__CUDACC__) */
+/*
+ * RT_DEVICE_NOINLINE may be used for non-performance critical
+ * functions that should not be inlined to minimize the amount
+ * of code that needs to be processed by the device compiler's
+ * optimizer.
+ */
+#ifndef __has_attribute
+#define __has_attribute(x) 0
+#endif
+#if __has_attribute(noinline)
+#define RT_NOINLINE_ATTR __attribute__((noinline))
+#else
+#define RT_NOINLINE_ATTR
+#endif
+#if (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
+#define RT_DEVICE_NOINLINE RT_NOINLINE_ATTR
+#define RT_DEVICE_NOINLINE_HOST_INLINE
+#else
+#define RT_DEVICE_NOINLINE
+#define RT_DEVICE_NOINLINE_HOST_INLINE inline
+#endif
+
#endif /* !FORTRAN_RUNTIME_API_ATTRS_H_ */
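
For readers unfamiliar with the pattern the new comment describes, here is a hypothetical, self-contained sketch of the same idea compiled as ordinary C++; the `DEMO_` macro names and the `AddOne` helper are illustrative stand-ins, not part of the flang runtime headers:

```cpp
// Stand-ins mirroring the macros added above: on a CUDA device compile the
// helper is kept out of line so the device optimizer sees less code; on a
// host compile it is an ordinary inline function.
#if (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__)
#define DEMO_DEVICE_NOINLINE __attribute__((noinline))
#define DEMO_DEVICE_NOINLINE_HOST_INLINE __attribute__((noinline))
#else
#define DEMO_DEVICE_NOINLINE
#define DEMO_DEVICE_NOINLINE_HOST_INLINE inline
#endif

// Non-performance-critical helper: inline on the host, out-of-line on device.
DEMO_DEVICE_NOINLINE_HOST_INLINE int AddOne(int x) { return x + 1; }

int main() { return AddOne(41) == 42 ? 0 : 1; }
```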
diff --git a/flang/include/flang/Common/visit.h b/flang/include/flang/Common/visit.h
index d867338be7e0..ad66297650b0 100644
--- a/flang/include/flang/Common/visit.h
+++ b/flang/include/flang/Common/visit.h
@@ -30,7 +30,7 @@ namespace log2visit {
template <std::size_t LOW, std::size_t HIGH, typename RESULT, typename VISITOR,
typename... VARIANT>
-inline RT_API_ATTRS RESULT Log2VisitHelper(
+RT_DEVICE_NOINLINE_HOST_INLINE RT_API_ATTRS RESULT Log2VisitHelper(
VISITOR &&visitor, std::size_t which, VARIANT &&...u) {
if constexpr (LOW + 7 >= HIGH) {
switch (which - LOW) {
@@ -68,8 +68,9 @@ inline RT_API_ATTRS RESULT Log2VisitHelper(
}
template <typename VISITOR, typename... VARIANT>
-inline RT_API_ATTRS auto visit(VISITOR &&visitor, VARIANT &&...u)
- -> decltype(visitor(std::get<0>(std::forward<VARIANT>(u))...)) {
+RT_DEVICE_NOINLINE_HOST_INLINE RT_API_ATTRS auto
+visit(VISITOR &&visitor, VARIANT &&...u) -> decltype(visitor(std::get<0>(
+ std::forward<VARIANT>(u))...)) {
using Result = decltype(visitor(std::get<0>(std::forward<VARIANT>(u))...));
if constexpr (sizeof...(u) == 1) {
static constexpr std::size_t high{
diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index 8aa065b025a4..9695c665d0cb 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -386,7 +386,7 @@ struct Procedure {
bool HasExplicitInterface() const {
return !attrs.test(Attr::ImplicitInterface);
}
- int FindPassIndex(std::optional<parser::CharBlock>) const;
+ std::optional<int> FindPassIndex(std::optional<parser::CharBlock>) const;
bool CanBeCalledViaImplicitInterface(std::string *whyNot = nullptr) const;
bool CanOverride(const Procedure &, std::optional<int> passIndex) const;
bool IsCompatibleWith(const Procedure &, bool ignoreImplicitVsExplicit,
diff --git a/flang/include/flang/Evaluate/constant.h b/flang/include/flang/Evaluate/constant.h
index 71be7906d2fe..d9866a08889f 100644
--- a/flang/include/flang/Evaluate/constant.h
+++ b/flang/include/flang/Evaluate/constant.h
@@ -126,8 +126,7 @@ public:
constexpr Result result() const { return result_; }
constexpr DynamicType GetType() const { return result_.GetType(); }
- llvm::raw_ostream &AsFortran(llvm::raw_ostream &,
- const parser::CharBlock *derivedTypeRename = nullptr) const;
+ llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
protected:
std::vector<Element> Reshape(const ConstantSubscripts &) const;
diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h
index 64db0b88d03e..642ddf511684 100644
--- a/flang/include/flang/Evaluate/expression.h
+++ b/flang/include/flang/Evaluate/expression.h
@@ -735,8 +735,7 @@ public:
StructureConstructor &Add(const semantics::Symbol &, Expr<SomeType> &&);
int Rank() const { return 0; }
DynamicType GetType() const;
- llvm::raw_ostream &AsFortran(llvm::raw_ostream &,
- const parser::CharBlock *derivedTypeRename = nullptr) const;
+ llvm::raw_ostream &AsFortran(llvm::raw_ostream &) const;
private:
std::optional<Expr<SomeType>> CreateParentComponent(const Symbol &) const;
diff --git a/flang/include/flang/Evaluate/type.h b/flang/include/flang/Evaluate/type.h
index 93a0f21fa914..de19e3d04dea 100644
--- a/flang/include/flang/Evaluate/type.h
+++ b/flang/include/flang/Evaluate/type.h
@@ -272,9 +272,6 @@ const semantics::DerivedTypeSpec *GetDerivedTypeSpec(
const semantics::DerivedTypeSpec *GetParentTypeSpec(
const semantics::DerivedTypeSpec &);
-std::string DerivedTypeSpecAsFortran(const semantics::DerivedTypeSpec &,
- const parser::CharBlock *derivedTypeRename = nullptr);
-
template <TypeCategory CATEGORY, int KIND = 0> struct TypeBase {
static constexpr TypeCategory category{CATEGORY};
static constexpr int kind{KIND};
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 977a69af5281..357df3b6df50 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -333,7 +333,10 @@ struct IntrinsicLibrary {
llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genScale(mlir::Type, llvm::ArrayRef<mlir::Value>);
fir::ExtendedValue genScan(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+ fir::ExtendedValue genSelectedCharKind(mlir::Type,
+ llvm::ArrayRef<fir::ExtendedValue>);
mlir::Value genSelectedIntKind(mlir::Type, llvm::ArrayRef<mlir::Value>);
+ mlir::Value genSelectedLogicalKind(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genSelectedRealKind(mlir::Type, llvm::ArrayRef<mlir::Value>);
mlir::Value genSetExponent(mlir::Type resultType,
llvm::ArrayRef<mlir::Value> args);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
index fec8c9906eff..558358257b51 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Numeric.h
@@ -46,10 +46,18 @@ mlir::Value genRRSpacing(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value genScale(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value x, mlir::Value i);
+/// Generate call to Selected_char_kind intrinsic runtime routine.
+mlir::Value genSelectedCharKind(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Value name, mlir::Value length);
+
/// Generate call to Selected_int_kind intrinsic runtime routine.
mlir::Value genSelectedIntKind(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value x);
+/// Generate call to Selected_logical_kind intrinsic runtime routine.
+mlir::Value genSelectedLogicalKind(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Value x);
+
/// Generate call to Selected_real_kind intrinsic runtime routine.
mlir::Value genSelectedRealKind(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::Value precision, mlir::Value range,
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 72157bce4f76..37b8da018195 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -152,15 +152,21 @@ def cuf_DataTransferOp : cuf_Op<"data_transfer", []> {
a = adev ! transfer device to host
bdev = adev ! transfer device to device
```
+
+ When the data transfer operates on data held by descriptors, the data
+ referenced by the LHS descriptor is updated. When required, the LHS
+ descriptor itself is also updated.
}];
- let arguments = (ins Arg<AnyReferenceLike, "", [MemWrite]>:$src,
- Arg<AnyReferenceLike, "", [MemRead]>:$dst,
+ let arguments = (ins Arg<AnyType, "", [MemRead]>:$src,
+ Arg<AnyRefOrBoxType, "", [MemWrite]>:$dst,
cuf_DataTransferKindAttr:$transfer_kind);
let assemblyFormat = [{
$src `to` $dst attr-dict `:` type(operands)
}];
+
+ let hasVerifier = 1;
}
def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.h b/flang/include/flang/Optimizer/HLFIR/Passes.h
index f0736c782b6c..4fa619cd53ca 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.h
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.h
@@ -24,9 +24,6 @@ namespace hlfir {
std::unique_ptr<mlir::Pass> createConvertHLFIRtoFIRPass();
std::unique_ptr<mlir::Pass> createBufferizeHLFIRPass();
-std::unique_ptr<mlir::Pass> createLowerHLFIRIntrinsicsPass();
-std::unique_ptr<mlir::Pass> createLowerHLFIROrderedAssignmentsPass();
-std::unique_ptr<mlir::Pass> createOptimizedBufferizationPass();
#define GEN_PASS_REGISTRATION
#include "flang/Optimizer/HLFIR/Passes.h.inc"
diff --git a/flang/include/flang/Optimizer/HLFIR/Passes.td b/flang/include/flang/Optimizer/HLFIR/Passes.td
index 0d4496a44c20..fc3d2a0d4681 100644
--- a/flang/include/flang/Optimizer/HLFIR/Passes.td
+++ b/flang/include/flang/Optimizer/HLFIR/Passes.td
@@ -23,19 +23,16 @@ def BufferizeHLFIR : Pass<"bufferize-hlfir", "::mlir::ModuleOp"> {
let constructor = "hlfir::createBufferizeHLFIRPass()";
}
-def OptimizedBufferization : Pass<"opt-bufferization", "::mlir::func::FuncOp"> {
+def OptimizedBufferization : Pass<"opt-bufferization"> {
let summary = "Special cases for hlfir.expr bufferization where we can avoid a temporary which would be created by the generic bufferization pass";
- let constructor = "hlfir::createOptimizedBufferizationPass()";
}
def LowerHLFIRIntrinsics : Pass<"lower-hlfir-intrinsics", "::mlir::ModuleOp"> {
let summary = "Lower HLFIR transformational intrinsic operations";
- let constructor = "hlfir::createLowerHLFIRIntrinsicsPass()";
}
def LowerHLFIROrderedAssignments : Pass<"lower-hlfir-ordered-assignments", "::mlir::ModuleOp"> {
let summary = "Lower HLFIR ordered assignments like forall and where operations";
- let constructor = "hlfir::createLowerHLFIROrderedAssignmentsPass()";
let options = [
Option<"tryFusingAssignments", "fuse-assignments",
"bool", /*default=*/"false",
diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h
index 21072772d184..a58163f5460c 100644
--- a/flang/include/flang/Semantics/scope.h
+++ b/flang/include/flang/Semantics/scope.h
@@ -225,6 +225,7 @@ public:
ImportKind GetImportKind() const;
// Names appearing in IMPORT statements in this scope
std::set<SourceName> importNames() const { return importNames_; }
+ bool CanImport(const SourceName &) const;
// Set the kind of imports from host into this scope.
// Return an error message for incompatible kinds.
@@ -298,7 +299,6 @@ private:
// or Symbol& points to one in there.
static Symbols<1024> allSymbols;
- bool CanImport(const SourceName &) const;
const DeclTypeSpec &MakeLengthlessType(DeclTypeSpec &&);
friend llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Scope &);
diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h
index 167e61381639..d382663762bc 100644
--- a/flang/include/flang/Semantics/semantics.h
+++ b/flang/include/flang/Semantics/semantics.h
@@ -110,6 +110,9 @@ public:
evaluate::FoldingContext &foldingContext() { return foldingContext_; }
parser::AllCookedSources &allCookedSources() { return allCookedSources_; }
ModuleDependences &moduleDependences() { return moduleDependences_; }
+ std::map<const Symbol *, SourceName> &moduleFileOutputRenamings() {
+ return moduleFileOutputRenamings_;
+ }
SemanticsContext &set_location(
const std::optional<parser::CharBlock> &location) {
@@ -299,6 +302,7 @@ private:
std::list<parser::Program> modFileParseTrees_;
std::unique_ptr<CommonBlockMap> commonBlockMap_;
ModuleDependences moduleDependences_;
+ std::map<const Symbol *, SourceName> moduleFileOutputRenamings_;
};
class Semantics {
diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h
index 50f7b68d80cb..f130036d949d 100644
--- a/flang/include/flang/Semantics/symbol.h
+++ b/flang/include/flang/Semantics/symbol.h
@@ -815,6 +815,7 @@ public:
void SetIsExplicitBindName(bool);
bool IsFuncResult() const;
bool IsObjectArray() const;
+ const ArraySpec *GetShape() const;
bool IsSubprogram() const;
bool IsFromModFile() const;
bool HasExplicitInterface() const {
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 3900b172917e..56cc9da7de0d 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -324,10 +324,11 @@ inline void createHLFIRToFIRPassPipeline(
if (optLevel.isOptimizingForSpeed()) {
addCanonicalizerPassWithoutRegionSimplification(pm);
pm.addPass(mlir::createCSEPass());
- pm.addPass(hlfir::createOptimizedBufferizationPass());
+ addNestedPassToAllTopLevelOperations(
+ pm, hlfir::createOptimizedBufferization);
}
- pm.addPass(hlfir::createLowerHLFIROrderedAssignmentsPass());
- pm.addPass(hlfir::createLowerHLFIRIntrinsicsPass());
+ pm.addPass(hlfir::createLowerHLFIROrderedAssignments());
+ pm.addPass(hlfir::createLowerHLFIRIntrinsics());
pm.addPass(hlfir::createBufferizeHLFIRPass());
pm.addPass(hlfir::createConvertHLFIRtoFIRPass());
}
diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp
index ab03ca5ed2d5..a0ce190b90e9 100644
--- a/flang/lib/Evaluate/characteristics.cpp
+++ b/flang/lib/Evaluate/characteristics.cpp
@@ -1333,16 +1333,21 @@ bool Procedure::IsCompatibleWith(const Procedure &actual,
return false;
}
-int Procedure::FindPassIndex(std::optional<parser::CharBlock> name) const {
+std::optional<int> Procedure::FindPassIndex(
+ std::optional<parser::CharBlock> name) const {
int argCount{static_cast<int>(dummyArguments.size())};
- int index{0};
if (name) {
- while (index < argCount && *name != dummyArguments[index].name.c_str()) {
- ++index;
+ for (int index{0}; index < argCount; ++index) {
+ if (*name == dummyArguments[index].name.c_str()) {
+ return index;
+ }
}
+ return std::nullopt;
+ } else if (argCount > 0) {
+ return 0;
+ } else {
+ return std::nullopt;
}
- CHECK(index < argCount);
- return index;
}
bool Procedure::CanOverride(
diff --git a/flang/lib/Evaluate/formatting.cpp b/flang/lib/Evaluate/formatting.cpp
index 20193b006bf2..0870d56549f7 100644
--- a/flang/lib/Evaluate/formatting.cpp
+++ b/flang/lib/Evaluate/formatting.cpp
@@ -14,6 +14,7 @@
#include "flang/Evaluate/fold.h"
#include "flang/Evaluate/tools.h"
#include "flang/Parser/characters.h"
+#include "flang/Semantics/semantics.h"
#include "flang/Semantics/symbol.h"
#include "llvm/Support/raw_ostream.h"
@@ -53,7 +54,7 @@ static void ShapeAsFortran(llvm::raw_ostream &o,
template <typename RESULT, typename VALUE>
llvm::raw_ostream &ConstantBase<RESULT, VALUE>::AsFortran(
- llvm::raw_ostream &o, const parser::CharBlock *derivedTypeRename) const {
+ llvm::raw_ostream &o) const {
bool hasNonDefaultLowerBound{printLbounds && HasNonDefaultLowerBound()};
if (Rank() > 1 || hasNonDefaultLowerBound) {
o << "reshape(";
@@ -85,8 +86,7 @@ llvm::raw_ostream &ConstantBase<RESULT, VALUE>::AsFortran(
o << ".false." << '_' << Result::kind;
}
} else {
- StructureConstructor{result_.derivedTypeSpec(), value}.AsFortran(
- o, derivedTypeRename);
+ StructureConstructor{result_.derivedTypeSpec(), value}.AsFortran(o);
}
}
if (Rank() > 0) {
@@ -124,9 +124,89 @@ llvm::raw_ostream &Constant<Type<TypeCategory::Character, KIND>>::AsFortran(
return o;
}
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const Symbol &symbol,
+ std::optional<parser::CharBlock> name = std::nullopt) {
+ const auto &renamings{symbol.owner().context().moduleFileOutputRenamings()};
+ if (auto iter{renamings.find(&symbol)}; iter != renamings.end()) {
+ return o << iter->second.ToString();
+ } else if (name) {
+ return o << name->ToString();
+ } else {
+ return o << symbol.name().ToString();
+ }
+}
+
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::string &lit) {
+ return o << parser::QuoteCharacterLiteral(lit);
+}
+
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::u16string &lit) {
+ return o << parser::QuoteCharacterLiteral(lit);
+}
+
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::u32string &lit) {
+ return o << parser::QuoteCharacterLiteral(lit);
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const A &x) {
+ return x.AsFortran(o);
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, common::Reference<A> x) {
+ return EmitVar(o, *x);
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(
+ llvm::raw_ostream &o, const A *p, const char *kw = nullptr) {
+ if (p) {
+ if (kw) {
+ o << kw;
+ }
+ EmitVar(o, *p);
+ }
+ return o;
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(
+ llvm::raw_ostream &o, const std::optional<A> &x, const char *kw = nullptr) {
+ if (x) {
+ if (kw) {
+ o << kw;
+ }
+ EmitVar(o, *x);
+ }
+ return o;
+}
+
+template <typename A, bool COPY>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o,
+ const common::Indirection<A, COPY> &p, const char *kw = nullptr) {
+ if (kw) {
+ o << kw;
+ }
+ EmitVar(o, p.value());
+ return o;
+}
+
+template <typename A>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::shared_ptr<A> &p) {
+ CHECK(p);
+ return EmitVar(o, *p);
+}
+
+template <typename... A>
+llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::variant<A...> &u) {
+ common::visit([&](const auto &x) { EmitVar(o, x); }, u);
+ return o;
+}
+
llvm::raw_ostream &ActualArgument::AssumedType::AsFortran(
llvm::raw_ostream &o) const {
- return o << symbol_->name().ToString();
+ return EmitVar(o, *symbol_);
}
llvm::raw_ostream &ActualArgument::AsFortran(llvm::raw_ostream &o) const {
@@ -504,15 +584,37 @@ llvm::raw_ostream &ExpressionBase<RESULT>::AsFortran(
return o;
}
-llvm::raw_ostream &StructureConstructor::AsFortran(
- llvm::raw_ostream &o, const parser::CharBlock *derivedTypeRename) const {
- o << DerivedTypeSpecAsFortran(result_.derivedTypeSpec(), derivedTypeRename);
+static std::string DerivedTypeSpecAsFortran(
+ const semantics::DerivedTypeSpec &spec) {
+ std::string buf;
+ llvm::raw_string_ostream ss{buf};
+ EmitVar(ss, spec.typeSymbol(), spec.name());
+ char ch{'('};
+ for (const auto &[name, value] : spec.parameters()) {
+ ss << ch << name.ToString() << '=';
+ ch = ',';
+ if (value.isAssumed()) {
+ ss << '*';
+ } else if (value.isDeferred()) {
+ ss << ':';
+ } else {
+ value.GetExplicit()->AsFortran(ss);
+ }
+ }
+ if (ch != '(') {
+ ss << ')';
+ }
+ return ss.str();
+}
+
+llvm::raw_ostream &StructureConstructor::AsFortran(llvm::raw_ostream &o) const {
+ o << DerivedTypeSpecAsFortran(result_.derivedTypeSpec());
if (values_.empty()) {
o << '(';
} else {
char ch{'('};
for (const auto &[symbol, value] : values_) {
- value.value().AsFortran(o << ch << symbol->name().ToString() << '=');
+ value.value().AsFortran(EmitVar(o << ch, *symbol) << '=');
ch = ',';
}
}
@@ -568,101 +670,6 @@ std::string SomeDerived::AsFortran() const {
}
}
-std::string DerivedTypeSpecAsFortran(const semantics::DerivedTypeSpec &spec,
- const parser::CharBlock *derivedTypeRename) {
- std::string buf;
- llvm::raw_string_ostream ss{buf};
- ss << (derivedTypeRename ? *derivedTypeRename : spec.name()).ToString();
- char ch{'('};
- for (const auto &[name, value] : spec.parameters()) {
- ss << ch << name.ToString() << '=';
- ch = ',';
- if (value.isAssumed()) {
- ss << '*';
- } else if (value.isDeferred()) {
- ss << ':';
- } else {
- value.GetExplicit()->AsFortran(ss);
- }
- }
- if (ch != '(') {
- ss << ')';
- }
- return ss.str();
-}
-
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const Symbol &symbol) {
- return o << symbol.name().ToString();
-}
-
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::string &lit) {
- return o << parser::QuoteCharacterLiteral(lit);
-}
-
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::u16string &lit) {
- return o << parser::QuoteCharacterLiteral(lit);
-}
-
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::u32string &lit) {
- return o << parser::QuoteCharacterLiteral(lit);
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const A &x) {
- return x.AsFortran(o);
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, common::Reference<A> x) {
- return EmitVar(o, *x);
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(
- llvm::raw_ostream &o, const A *p, const char *kw = nullptr) {
- if (p) {
- if (kw) {
- o << kw;
- }
- EmitVar(o, *p);
- }
- return o;
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(
- llvm::raw_ostream &o, const std::optional<A> &x, const char *kw = nullptr) {
- if (x) {
- if (kw) {
- o << kw;
- }
- EmitVar(o, *x);
- }
- return o;
-}
-
-template <typename A, bool COPY>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o,
- const common::Indirection<A, COPY> &p, const char *kw = nullptr) {
- if (kw) {
- o << kw;
- }
- EmitVar(o, p.value());
- return o;
-}
-
-template <typename A>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::shared_ptr<A> &p) {
- CHECK(p);
- return EmitVar(o, *p);
-}
-
-template <typename... A>
-llvm::raw_ostream &EmitVar(llvm::raw_ostream &o, const std::variant<A...> &u) {
- common::visit([&](const auto &x) { EmitVar(o, x); }, u);
- return o;
-}
-
llvm::raw_ostream &BaseObject::AsFortran(llvm::raw_ostream &o) const {
return EmitVar(o, u);
}
diff --git a/flang/lib/Evaluate/shape.cpp b/flang/lib/Evaluate/shape.cpp
index 6246cb931ff9..5cf48b240eca 100644
--- a/flang/lib/Evaluate/shape.cpp
+++ b/flang/lib/Evaluate/shape.cpp
@@ -885,8 +885,12 @@ auto GetShapeHelper::operator()(const ProcedureRef &call) const -> Result {
intrinsic->name == "ubound") {
// For LBOUND/UBOUND, these are the array-valued cases (no DIM=)
if (!call.arguments().empty() && call.arguments().front()) {
- return Shape{
- MaybeExtentExpr{ExtentExpr{call.arguments().front()->Rank()}}};
+ if (IsAssumedRank(*call.arguments().front())) {
+ return Shape{MaybeExtentExpr{}};
+ } else {
+ return Shape{
+ MaybeExtentExpr{ExtentExpr{call.arguments().front()->Rank()}}};
+ }
}
} else if (intrinsic->name == "all" || intrinsic->name == "any" ||
intrinsic->name == "count" || intrinsic->name == "iall" ||
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 4e50de3e7ee9..898b37504a6e 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -57,6 +57,7 @@
#include "flang/Semantics/symbol.h"
#include "flang/Semantics/tools.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/IR/Matchers.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Parser/Parser.h"
#include "mlir/Transforms/RegionUtils.h"
@@ -3782,21 +3783,36 @@ private:
hlfir::Entity &lhs, hlfir::Entity &rhs) {
bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
- if (rhs.isBoxAddressOrValue() || lhs.isBoxAddressOrValue())
- TODO(loc, "CUDA data transfler with descriptors");
+
+ auto getRefIfLoaded = [](mlir::Value val) -> mlir::Value {
+ if (auto loadOp =
+ mlir::dyn_cast_or_null<fir::LoadOp>(val.getDefiningOp()))
+ return loadOp.getMemref();
+ return val;
+ };
+
+ mlir::Value rhsVal = getRefIfLoaded(rhs.getBase());
+ mlir::Value lhsVal = getRefIfLoaded(lhs.getBase());
// device = host
if (lhsIsDevice && !rhsIsDevice) {
auto transferKindAttr = cuf::DataTransferKindAttr::get(
builder.getContext(), cuf::DataTransferKind::HostDevice);
if (!rhs.isVariable()) {
- auto associate = hlfir::genAssociateExpr(
- loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
- builder.create<cuf::DataTransferOp>(loc, associate.getBase(), lhs,
- transferKindAttr);
- builder.create<hlfir::EndAssociateOp>(loc, associate);
+ // Special case if the rhs is a constant.
+ if (matchPattern(rhs.getDefiningOp(), mlir::m_Constant())) {
+ builder.create<cuf::DataTransferOp>(loc, rhs, lhsVal,
+ transferKindAttr);
+ } else {
+ auto associate = hlfir::genAssociateExpr(
+ loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
+ builder.create<cuf::DataTransferOp>(loc, associate.getBase(), lhsVal,
+ transferKindAttr);
+ builder.create<hlfir::EndAssociateOp>(loc, associate);
+ }
} else {
- builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
+ builder.create<cuf::DataTransferOp>(loc, rhsVal, lhsVal,
+ transferKindAttr);
}
return;
}
@@ -3805,26 +3821,18 @@ private:
if (!lhsIsDevice && rhsIsDevice) {
auto transferKindAttr = cuf::DataTransferKindAttr::get(
builder.getContext(), cuf::DataTransferKind::DeviceHost);
- if (!rhs.isVariable()) {
- // evaluateRhs loads scalar. Look for the memory reference to be used in
- // the transfer.
- if (mlir::isa_and_nonnull<fir::LoadOp>(rhs.getDefiningOp())) {
- auto loadOp = mlir::dyn_cast<fir::LoadOp>(rhs.getDefiningOp());
- builder.create<cuf::DataTransferOp>(loc, loadOp.getMemref(), lhs,
- transferKindAttr);
- return;
- }
- } else {
- builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
- }
+ builder.create<cuf::DataTransferOp>(loc, rhsVal, lhsVal,
+ transferKindAttr);
return;
}
+ // device = device
if (lhsIsDevice && rhsIsDevice) {
assert(rhs.isVariable() && "CUDA Fortran assignment rhs is not legal");
auto transferKindAttr = cuf::DataTransferKindAttr::get(
builder.getContext(), cuf::DataTransferKind::DeviceDevice);
- builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
+ builder.create<cuf::DataTransferOp>(loc, rhsVal, lhsVal,
+ transferKindAttr);
return;
}
llvm_unreachable("Unhandled CUDA data transfer");
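As a rough CUDA Fortran sketch (illustrative only, not taken from the patch) of the assignments this data-transfer lowering now covers, including the constant right-hand-side special case:

    real, device :: a_dev(10)
    real :: a_host(10)
    a_dev = 1.0      ! host -> device, constant rhs handled without a temporary
    a_dev = a_host   ! host -> device
    a_host = a_dev   ! device -> host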
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 875599098b3d..68619f699ebb 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -882,8 +882,11 @@ bool ClauseProcessor::processMap(
// Explicit map captures are captured ByRef by default,
// optimisation passes may alter this to ByCopy or other capture
// types to optimise
+ auto location = mlir::NameLoc::get(
+ mlir::StringAttr::get(firOpBuilder.getContext(), asFortran.str()),
+ symAddr.getLoc());
mlir::omp::MapInfoOp mapOp = createMapInfoOp(
- firOpBuilder, clauseLocation, symAddr,
+ firOpBuilder, location, symAddr,
/*varPtrPtr=*/mlir::Value{}, asFortran.str(), bounds,
/*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{},
static_cast<
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 17b362cc2f32..1569605e785b 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1604,9 +1604,12 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO;
mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM;
}
-
+ auto location =
+ mlir::NameLoc::get(mlir::StringAttr::get(firOpBuilder.getContext(),
+ sym.name().ToString()),
+ baseOp.getLoc());
mlir::Value mapOp = createMapInfoOp(
- firOpBuilder, baseOp.getLoc(), baseOp, /*varPtrPtr=*/mlir::Value{},
+ firOpBuilder, location, baseOp, /*varPtrPtr=*/mlir::Value{},
name.str(), bounds, /*members=*/{},
/*membersIndex=*/mlir::DenseIntElementsAttr{},
static_cast<
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ae7e65098744..ad2f9236f0db 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -549,10 +549,18 @@ static constexpr IntrinsicHandler handlers[]{
{"back", asValue, handleDynamicOptional},
{"kind", asValue}}},
/*isElemental=*/true},
+ {"selected_char_kind",
+ &I::genSelectedCharKind,
+ {{{"name", asAddr}}},
+ /*isElemental=*/false},
{"selected_int_kind",
&I::genSelectedIntKind,
{{{"scalar", asAddr}}},
/*isElemental=*/false},
+ {"selected_logical_kind",
+ &I::genSelectedLogicalKind,
+ {{{"bits", asAddr}}},
+ /*isElemental=*/false},
{"selected_real_kind",
&I::genSelectedRealKind,
{{{"precision", asAddr, handleDynamicOptional},
@@ -5873,6 +5881,18 @@ IntrinsicLibrary::genScan(mlir::Type resultType,
return readAndAddCleanUp(resultMutableBox, resultType, "SCAN");
}
+// SELECTED_CHAR_KIND
+fir::ExtendedValue
+IntrinsicLibrary::genSelectedCharKind(mlir::Type resultType,
+ llvm::ArrayRef<fir::ExtendedValue> args) {
+ assert(args.size() == 1);
+
+ return builder.createConvert(
+ loc, resultType,
+ fir::runtime::genSelectedCharKind(builder, loc, fir::getBase(args[0]),
+ fir::getLen(args[0])));
+}
+
// SELECTED_INT_KIND
mlir::Value
IntrinsicLibrary::genSelectedIntKind(mlir::Type resultType,
@@ -5884,6 +5904,17 @@ IntrinsicLibrary::genSelectedIntKind(mlir::Type resultType,
fir::runtime::genSelectedIntKind(builder, loc, fir::getBase(args[0])));
}
+// SELECTED_LOGICAL_KIND
+mlir::Value
+IntrinsicLibrary::genSelectedLogicalKind(mlir::Type resultType,
+ llvm::ArrayRef<mlir::Value> args) {
+ assert(args.size() == 1);
+
+ return builder.createConvert(loc, resultType,
+ fir::runtime::genSelectedLogicalKind(
+ builder, loc, fir::getBase(args[0])));
+}
+
// SELECTED_REAL_KIND
mlir::Value
IntrinsicLibrary::genSelectedRealKind(mlir::Type resultType,
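A small Fortran usage sketch (illustrative) for the two intrinsics whose lowering is added above:

    integer :: ck, lk
    ck = selected_char_kind('ASCII')   ! character kind query, lowered to the runtime
    lk = selected_logical_kind(8)      ! smallest LOGICAL kind with at least 8 bits of storage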
diff --git a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
index 81d5d21ece7a..8ac9d64f576b 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Numeric.cpp
@@ -468,6 +468,26 @@ mlir::Value fir::runtime::genScale(fir::FirOpBuilder &builder,
return builder.create<fir::CallOp>(loc, func, args).getResult(0);
}
+/// Generate call to Selected_char_kind intrinsic runtime routine.
+mlir::Value fir::runtime::genSelectedCharKind(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value name,
+ mlir::Value length) {
+ mlir::func::FuncOp func =
+ fir::runtime::getRuntimeFunc<mkRTKey(SelectedCharKind)>(loc, builder);
+ auto fTy = func.getFunctionType();
+ auto sourceFile = fir::factory::locationToFilename(builder, loc);
+ auto sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(1));
+ if (!fir::isa_ref_type(name.getType()))
+ fir::emitFatalError(loc, "argument address for runtime not found");
+
+ auto args = fir::runtime::createArguments(builder, loc, fTy, sourceFile,
+ sourceLine, name, length);
+
+ return builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
/// Generate call to Selected_int_kind intrinsic runtime routine.
mlir::Value fir::runtime::genSelectedIntKind(fir::FirOpBuilder &builder,
mlir::Location loc,
@@ -489,6 +509,27 @@ mlir::Value fir::runtime::genSelectedIntKind(fir::FirOpBuilder &builder,
return builder.create<fir::CallOp>(loc, func, args).getResult(0);
}
+/// Generate call to Selected_logical_kind intrinsic runtime routine.
+mlir::Value fir::runtime::genSelectedLogicalKind(fir::FirOpBuilder &builder,
+ mlir::Location loc,
+ mlir::Value x) {
+ mlir::func::FuncOp func =
+ fir::runtime::getRuntimeFunc<mkRTKey(SelectedLogicalKind)>(loc, builder);
+ auto fTy = func.getFunctionType();
+ auto sourceFile = fir::factory::locationToFilename(builder, loc);
+ auto sourceLine =
+ fir::factory::locationToLineNo(builder, loc, fTy.getInput(1));
+ if (!fir::isa_ref_type(x.getType()))
+ fir::emitFatalError(loc, "argument address for runtime not found");
+ mlir::Type eleTy = fir::unwrapRefType(x.getType());
+ mlir::Value xKind = builder.createIntegerConstant(
+ loc, fTy.getInput(3), eleTy.getIntOrFloatBitWidth() / 8);
+ auto args = fir::runtime::createArguments(builder, loc, fTy, sourceFile,
+ sourceLine, x, xKind);
+
+ return builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
/// Generate call to Selected_real_kind intrinsic runtime routine.
mlir::Value fir::runtime::genSelectedRealKind(fir::FirOpBuilder &builder,
mlir::Location loc,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index 72172f63888e..74e68725003c 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2716,6 +2716,18 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
mlir::LogicalResult
matchAndRewrite(fir::GlobalOp global, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
+
+ mlir::LLVM::DIGlobalVariableExpressionAttr dbgExpr;
+
+ if (auto fusedLoc = mlir::dyn_cast<mlir::FusedLoc>(global.getLoc())) {
+ if (auto gvAttr =
+ mlir::dyn_cast_or_null<mlir::LLVM::DIGlobalVariableAttr>(
+ fusedLoc.getMetadata())) {
+ dbgExpr = mlir::LLVM::DIGlobalVariableExpressionAttr::get(
+ global.getContext(), gvAttr, mlir::LLVM::DIExpressionAttr());
+ }
+ }
+
auto tyAttr = convertType(global.getType());
if (auto boxType = mlir::dyn_cast<fir::BaseBoxType>(global.getType()))
tyAttr = this->lowerTy().convertBoxTypeAsStruct(boxType);
@@ -2724,8 +2736,11 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
assert(attributeTypeIsCompatible(global.getContext(), initAttr, tyAttr));
auto linkage = convertLinkage(global.getLinkName());
auto isConst = global.getConstant().has_value();
+ mlir::SymbolRefAttr comdat;
+ llvm::ArrayRef<mlir::NamedAttribute> attrs;
auto g = rewriter.create<mlir::LLVM::GlobalOp>(
- loc, tyAttr, isConst, linkage, global.getSymName(), initAttr);
+ loc, tyAttr, isConst, linkage, global.getSymName(), initAttr, 0, 0,
+ false, false, comdat, attrs, dbgExpr);
auto module = global->getParentOfType<mlir::ModuleOp>();
// Add comdat if necessary
@@ -2966,39 +2981,40 @@ struct SelectCaseOpConversion : public fir::FIROpConversion<fir::SelectCaseOp> {
caseOp.getSuccessorOperands(adaptor.getOperands(), t);
std::optional<mlir::ValueRange> cmpOps =
*caseOp.getCompareOperands(adaptor.getOperands(), t);
- mlir::Value caseArg = *(cmpOps.value().begin());
mlir::Attribute attr = cases[t];
+ assert(mlir::isa<mlir::UnitAttr>(attr) || cmpOps.has_value());
if (mlir::isa<fir::PointIntervalAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::eq, selector, caseArg);
+ loc, mlir::LLVM::ICmpPredicate::eq, selector, cmpOps->front());
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
if (mlir::isa<fir::LowerBoundAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::sle, caseArg, selector);
+ loc, mlir::LLVM::ICmpPredicate::sle, cmpOps->front(), selector);
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
if (mlir::isa<fir::UpperBoundAttr>(attr)) {
auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::sle, selector, caseArg);
+ loc, mlir::LLVM::ICmpPredicate::sle, selector, cmpOps->front());
genCaseLadderStep(loc, cmp, dest, destOps, rewriter);
continue;
}
if (mlir::isa<fir::ClosedIntervalAttr>(attr)) {
- auto cmp = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::sle, caseArg, selector);
+ mlir::Value caseArg0 = *cmpOps->begin();
+ auto cmp0 = rewriter.create<mlir::LLVM::ICmpOp>(
+ loc, mlir::LLVM::ICmpPredicate::sle, caseArg0, selector);
auto *thisBlock = rewriter.getInsertionBlock();
auto *newBlock1 = createBlock(rewriter, dest);
auto *newBlock2 = createBlock(rewriter, dest);
rewriter.setInsertionPointToEnd(thisBlock);
- rewriter.create<mlir::LLVM::CondBrOp>(loc, cmp, newBlock1, newBlock2);
+ rewriter.create<mlir::LLVM::CondBrOp>(loc, cmp0, newBlock1, newBlock2);
rewriter.setInsertionPointToEnd(newBlock1);
- mlir::Value caseArg0 = *(cmpOps.value().begin() + 1);
- auto cmp0 = rewriter.create<mlir::LLVM::ICmpOp>(
- loc, mlir::LLVM::ICmpPredicate::sle, selector, caseArg0);
- genCondBrOp(loc, cmp0, dest, destOps, rewriter, newBlock2);
+ mlir::Value caseArg1 = *(cmpOps->begin() + 1);
+ auto cmp1 = rewriter.create<mlir::LLVM::ICmpOp>(
+ loc, mlir::LLVM::ICmpPredicate::sle, selector, caseArg1);
+ genCondBrOp(loc, cmp1, dest, destOps, rewriter, newBlock2);
rewriter.setInsertionPointToEnd(newBlock2);
continue;
}
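For context, a minimal Fortran sketch (illustrative) of a closed-interval case, the branch rewritten above to derive its two comparisons from the two compare operands:

    select case (i)
    case (1:10)        ! lowered as (1 <= i) followed by (i <= 10)
      n = n + 1
    case default
      n = 0
    end select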
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 870652c72fab..2c0c4c2cfae3 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -90,6 +90,24 @@ mlir::LogicalResult cuf::AllocateOp::verify() {
}
//===----------------------------------------------------------------------===//
+// DataTransferOp
+//===----------------------------------------------------------------------===//
+
+mlir::LogicalResult cuf::DataTransferOp::verify() {
+ mlir::Type srcTy = getSrc().getType();
+ mlir::Type dstTy = getDst().getType();
+ if ((fir::isa_ref_type(srcTy) && fir::isa_ref_type(dstTy)) ||
+ (fir::isa_box_type(srcTy) && fir::isa_box_type(dstTy)))
+ return mlir::success();
+ if (fir::isa_trivial(srcTy) &&
+ matchPattern(getSrc().getDefiningOp(), mlir::m_Constant()))
+ return mlir::success();
+ return emitOpError()
+ << "expect src and dst to be both references or descriptors or src to "
+ "be a constant";
+}
+
+//===----------------------------------------------------------------------===//
// DeallocateOp
//===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index 11196353b07c..218b38e9ba79 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -1115,7 +1115,7 @@ mlir::LogicalResult
hlfir::MatmulOp::canonicalize(MatmulOp matmulOp,
mlir::PatternRewriter &rewriter) {
// the only two uses of the transposed matrix should be for the hlfir.matmul
- // and hlfir.destory
+ // and hlfir.destroy
auto isOtherwiseUnused = [&](hlfir::TransposeOp transposeOp) -> bool {
std::size_t numUses = 0;
for (mlir::Operation *user : transposeOp.getResult().getUsers()) {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
index 06d051876384..6c8e3e119374 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/InlineElementals.cpp
@@ -32,7 +32,7 @@ namespace hlfir {
} // namespace hlfir
/// If the elemental has only two uses and those two are an apply operation and
-/// a destory operation, return those two, otherwise return {}
+/// a destroy operation, return those two, otherwise return {}
static std::optional<std::pair<hlfir::ApplyOp, hlfir::DestroyOp>>
getTwoUses(hlfir::ElementalOp elemental) {
mlir::Operation::user_range users = elemental->getUsers();
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
index e9dbb7095d0e..707c0feffbb3 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
@@ -468,13 +468,6 @@ class LowerHLFIRIntrinsics
: public hlfir::impl::LowerHLFIRIntrinsicsBase<LowerHLFIRIntrinsics> {
public:
void runOnOperation() override {
- // TODO: make this a pass operating on FuncOp. The issue is that
- // FirOpBuilder helpers may generate new FuncOp because of runtime/llvm
- // intrinsics calls creation. This may create race conflict if the pass is
- // scheduled on FuncOp. A solution could be to provide an optional mutex
- // when building a FirOpBuilder and locking around FuncOp and GlobalOp
- // creation, but this needs a bit more thinking, so at this point the pass
- // is scheduled on the moduleOp.
mlir::ModuleOp module = this->getOperation();
mlir::MLIRContext *context = &getContext();
mlir::RewritePatternSet patterns(context);
@@ -504,7 +497,3 @@ public:
}
};
} // namespace
-
-std::unique_ptr<mlir::Pass> hlfir::createLowerHLFIRIntrinsicsPass() {
- return std::make_unique<LowerHLFIRIntrinsics>();
-}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
index c9ff4b1c3374..a1a89bb5154f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp
@@ -1383,6 +1383,9 @@ class LowerHLFIROrderedAssignments
: public hlfir::impl::LowerHLFIROrderedAssignmentsBase<
LowerHLFIROrderedAssignments> {
public:
+ using LowerHLFIROrderedAssignmentsBase<
+ LowerHLFIROrderedAssignments>::LowerHLFIROrderedAssignmentsBase;
+
void runOnOperation() override {
// Running on a ModuleOp because this pass may generate FuncOp declaration
// for runtime calls. This could be a FuncOp pass otherwise.
@@ -1409,7 +1412,3 @@ public:
}
};
} // namespace
-
-std::unique_ptr<mlir::Pass> hlfir::createLowerHLFIROrderedAssignmentsPass() {
- return std::make_unique<LowerHLFIROrderedAssignments>();
-}
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 8d68c7021608..3c8424ca564e 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -1038,7 +1038,6 @@ class OptimizedBufferizationPass
OptimizedBufferizationPass> {
public:
void runOnOperation() override {
- mlir::func::FuncOp func = getOperation();
mlir::MLIRContext *context = &getContext();
mlir::GreedyRewriteConfig config;
@@ -1062,15 +1061,11 @@ public:
patterns.insert<MinMaxlocElementalConversion<hlfir::MaxlocOp>>(context);
if (mlir::failed(mlir::applyPatternsAndFoldGreedily(
- func, std::move(patterns), config))) {
- mlir::emitError(func.getLoc(),
+ getOperation(), std::move(patterns), config))) {
+ mlir::emitError(getOperation()->getLoc(),
"failure in HLFIR optimized bufferization");
signalPassFailure();
}
}
};
} // namespace
-
-std::unique_ptr<mlir::Pass> hlfir::createOptimizedBufferizationPass() {
- return std::make_unique<OptimizedBufferizationPass>();
-}
diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 07e8aed4cd07..fb7c0bf0d1f9 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -54,6 +54,16 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase<AddDebugInfoPass> {
public:
AddDebugInfoPass(fir::AddDebugInfoOptions options) : Base(options) {}
void runOnOperation() override;
+
+private:
+ llvm::StringMap<mlir::LLVM::DIModuleAttr> moduleMap;
+
+ mlir::LLVM::DIModuleAttr getOrCreateModuleAttr(
+ const std::string &name, mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope, unsigned line, bool decl);
+
+  void handleGlobalOp(fir::GlobalOp globalOp, mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope);
};
static uint32_t getLineFromLoc(mlir::Location loc) {
@@ -99,6 +109,70 @@ void AddDebugInfoPass::handleDeclareOp(fir::cg::XDeclareOp declOp,
declOp->setLoc(builder.getFusedLoc({declOp->getLoc()}, localVarAttr));
}
+// The `module` does not have a first-class representation in FIR. We extract
+// information about it from the names of the identifiers and keep a map to
+// avoid duplication.
+mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr(
+ const std::string &name, mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope, unsigned line, bool decl) {
+ mlir::MLIRContext *context = &getContext();
+ mlir::LLVM::DIModuleAttr modAttr;
+ if (auto iter{moduleMap.find(name)}; iter != moduleMap.end()) {
+ modAttr = iter->getValue();
+ } else {
+ modAttr = mlir::LLVM::DIModuleAttr::get(
+ context, fileAttr, scope, mlir::StringAttr::get(context, name),
+ /* configMacros */ mlir::StringAttr(),
+ /* includePath */ mlir::StringAttr(),
+ /* apinotes */ mlir::StringAttr(), line, decl);
+ moduleMap[name] = modAttr;
+ }
+ return modAttr;
+}
+
+void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp,
+ mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope) {
+ mlir::ModuleOp module = getOperation();
+ mlir::MLIRContext *context = &getContext();
+ fir::DebugTypeGenerator typeGen(module);
+ mlir::OpBuilder builder(context);
+
+ std::pair result = fir::NameUniquer::deconstruct(globalOp.getSymName());
+ if (result.first != fir::NameUniquer::NameKind::VARIABLE)
+ return;
+
+ unsigned line = getLineFromLoc(globalOp.getLoc());
+
+  // DWARF5 says the following about Fortran modules:
+  //   A Fortran 90 module may also be represented by a module entry
+  //   (but no declaration attribute is warranted because Fortran has no
+  //   concept of a corresponding module body).
+  // In practice, however, compilers use the declaration attribute with a
+  // module when the module was defined in another source file (and is only
+  // being used in this one). isInitialized() seems to provide the right
+  // information, but inverted: it is true where the module is actually
+  // defined and false where it is used.
+  // FIXME: We currently don't have the line number on which a module was
+  // declared, so we use a best guess of line - 1, where line is the source
+  // line of the first member of the module that we encounter.
+
+ if (result.second.modules.empty())
+ return;
+
+ scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope,
+ line - 1, !globalOp.isInitialized());
+
+ mlir::LLVM::DITypeAttr diType = typeGen.convertType(
+ globalOp.getType(), fileAttr, scope, globalOp.getLoc());
+ auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get(
+ context, scope, mlir::StringAttr::get(context, result.second.name),
+ mlir::StringAttr::get(context, globalOp.getName()), fileAttr, line,
+ diType, /*isLocalToUnit*/ false,
+ /*isDefinition*/ globalOp.isInitialized(), /* alignInBits*/ 0);
+ globalOp->setLoc(builder.getFusedLoc({globalOp->getLoc()}, gvAttr));
+}
+
void AddDebugInfoPass::runOnOperation() {
mlir::ModuleOp module = getOperation();
mlir::MLIRContext *context = &getContext();
@@ -138,6 +212,12 @@ void AddDebugInfoPass::runOnOperation() {
llvm::dwarf::getLanguage("DW_LANG_Fortran95"), fileAttr, producer,
isOptimized, debugLevel);
+ if (debugLevel == mlir::LLVM::DIEmissionKind::Full) {
+ // Process 'GlobalOp' only if full debug info is requested.
+ for (auto globalOp : module.getOps<fir::GlobalOp>())
+ handleGlobalOp(globalOp, fileAttr, cuAttr);
+ }
+
module.walk([&](mlir::func::FuncOp funcOp) {
mlir::Location l = funcOp->getLoc();
// If fused location has already been created then nothing to do
@@ -180,6 +260,7 @@ void AddDebugInfoPass::runOnOperation() {
// Only definitions need a distinct identifier and a compilation unit.
mlir::DistinctAttr id;
+ mlir::LLVM::DIScopeAttr Scope = fileAttr;
mlir::LLVM::DICompileUnitAttr compilationUnit;
mlir::LLVM::DISubprogramFlags subprogramFlags =
mlir::LLVM::DISubprogramFlags{};
@@ -192,9 +273,13 @@ void AddDebugInfoPass::runOnOperation() {
subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
}
unsigned line = getLineFromLoc(l);
+ if (!result.second.modules.empty())
+ Scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, cuAttr,
+ line - 1, false);
+
auto spAttr = mlir::LLVM::DISubprogramAttr::get(
- context, id, compilationUnit, fileAttr, funcName, fullName,
- funcFileAttr, line, line, subprogramFlags, subTypeAttr);
+ context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
+ line, line, subprogramFlags, subTypeAttr);
funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
// Don't process variables if user asked for line tables only.
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index 64c6547e06e0..07163de958f9 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -37,6 +37,45 @@ static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) {
llvm::dwarf::DW_ATE_signed);
}
+mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType(
+ fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope, mlir::Location loc) {
+
+ mlir::MLIRContext *context = module.getContext();
+  // FIXME: Only fixed-size arrays are handled at the moment.
+ if (seqTy.hasDynamicExtents())
+ return genPlaceholderType(context);
+
+ llvm::SmallVector<mlir::LLVM::DINodeAttr> elements;
+ mlir::LLVM::DITypeAttr elemTy =
+ convertType(seqTy.getEleTy(), fileAttr, scope, loc);
+
+ for (fir::SequenceType::Extent dim : seqTy.getShape()) {
+ auto intTy = mlir::IntegerType::get(context, 64);
+    // FIXME: Only a lower bound of 1 is supported at the moment. The
+    // 'SequenceType' has information about the shape but not the shift. In
+    // cases where the conversion originated during the processing of
+    // 'DeclareOp', it may be possible to pass on this information. But the
+    // type conversion should ideally be based only on what information is
+    // present in the type class, so that it works everywhere (e.g. when the
+    // array is part of a module or a derived type).
+ auto countAttr = mlir::IntegerAttr::get(intTy, llvm::APInt(64, dim));
+ auto lowerAttr = mlir::IntegerAttr::get(intTy, llvm::APInt(64, 1));
+ auto subrangeTy = mlir::LLVM::DISubrangeAttr::get(
+ context, countAttr, lowerAttr, nullptr, nullptr);
+ elements.push_back(subrangeTy);
+ }
+  // Apart from arrays, `DICompositeTypeAttr` is also used for other things,
+  // such as structure types. Its fields that are not applicable to arrays
+  // are set to valid default values.
+
+ return mlir::LLVM::DICompositeTypeAttr::get(
+ context, llvm::dwarf::DW_TAG_array_type, /*recursive id*/ {},
+ /* name */ nullptr, /* file */ nullptr, /* line */ 0, /* scope */ nullptr,
+ elemTy, mlir::LLVM::DIFlags::Zero, /* sizeInBits */ 0,
+ /*alignInBits*/ 0, elements);
+}
+
mlir::LLVM::DITypeAttr
DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
mlir::LLVM::DIScopeAttr scope,
@@ -57,6 +96,20 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
mlir::StringAttr::get(context, logTy.getMnemonic()),
kindMapping.getLogicalBitsize(logTy.getFKind()),
llvm::dwarf::DW_ATE_boolean);
+ } else if (fir::isa_complex(Ty)) {
+ unsigned bitWidth;
+ if (auto cplxTy = mlir::dyn_cast_or_null<mlir::ComplexType>(Ty)) {
+ auto floatTy = mlir::cast<mlir::FloatType>(cplxTy.getElementType());
+ bitWidth = floatTy.getWidth();
+ } else if (auto cplxTy = mlir::dyn_cast_or_null<fir::ComplexType>(Ty)) {
+ bitWidth = kindMapping.getRealBitsize(cplxTy.getFKind());
+ } else {
+ llvm_unreachable("Unhandled complex type");
+ }
+ return genBasicType(context, mlir::StringAttr::get(context, "complex"),
+ bitWidth * 2, llvm::dwarf::DW_ATE_complex_float);
+ } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(Ty)) {
+ return convertSequenceType(seqTy, fileAttr, scope, loc);
} else {
// FIXME: These types are currently unhandled. We are generating a
// placeholder type to allow us to test supported bits.
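Fortran declarations of the kinds now handled (illustrative sketch; exact DWARF output depends on the target):

    complex(kind=8) :: z   ! basic type with DW_ATE_complex_float, 128 bits
    real :: a(5, 10)       ! DW_TAG_array_type with one subrange per dimension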
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
index 5a2bb201db47..963c919d6682 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
@@ -31,6 +31,10 @@ public:
mlir::Location loc);
private:
+ mlir::LLVM::DITypeAttr convertSequenceType(fir::SequenceType seqTy,
+ mlir::LLVM::DIFileAttr fileAttr,
+ mlir::LLVM::DIScopeAttr scope,
+ mlir::Location loc);
mlir::ModuleOp module;
KindMapping kindMapping;
};
diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index 8f51ef5ebeba..48c888c0dfb2 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -761,7 +761,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
}
// 15.5.2.5 -- actual & dummy are both POINTER or both ALLOCATABLE
- // For INTENT(IN) we relax two checks that are in Fortran to
+ // For INTENT(IN), and for a polymorphic actual being associated with a
+ // monomorphic dummy, we relax two checks that are in Fortran to
// prevent the callee from changing the type or to avoid having
// to use a descriptor.
if (!typesCompatible) {
@@ -770,7 +771,9 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
(actualIsAllocatable && dummyIsAllocatable)) {
bool actualIsUnlimited{actualType.type().IsUnlimitedPolymorphic()};
bool dummyIsUnlimited{dummy.type.type().IsUnlimitedPolymorphic()};
+ bool checkTypeCompatibility{true};
if (actualIsUnlimited != dummyIsUnlimited) {
+ checkTypeCompatibility = false;
if (dummyIsUnlimited && dummy.intent == common::Intent::In &&
context.IsEnabled(common::LanguageFeature::RelaxedIntentInChecking)) {
if (context.ShouldWarn(
@@ -790,11 +793,21 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
messages.Say(
"If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both should be so"_port_en_US);
}
+ } else if (actualIsPolymorphic &&
+ context.IsEnabled(common::LanguageFeature::
+ PolymorphicActualAllocatableOrPointerToMonomorphicDummy)) {
+ if (context.ShouldWarn(common::LanguageFeature::
+ PolymorphicActualAllocatableOrPointerToMonomorphicDummy)) {
+ messages.Say(
+ "If a POINTER or ALLOCATABLE actual argument is polymorphic, the corresponding dummy argument should also be so"_port_en_US);
+ }
} else {
+ checkTypeCompatibility = false;
messages.Say(
"If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so"_err_en_US);
}
- } else if (!actualIsUnlimited) {
+ }
+ if (checkTypeCompatibility && !actualIsUnlimited) {
if (!actualType.type().IsTkCompatibleWith(dummy.type.type())) {
if (dummy.intent == common::Intent::In &&
context.IsEnabled(
@@ -1116,20 +1129,20 @@ static void CheckExplicitInterfaceArg(evaluate::ActualArgument &arg,
}
auto restorer{
messages.SetLocation(arg.sourceLocation().value_or(messages.at()))};
- auto checkActualArgForLabel = [&](evaluate::ActualArgument &arg) {
+ auto CheckActualArgForLabel = [&](evaluate::ActualArgument &arg) {
if (arg.isAlternateReturn()) {
messages.Say(
"Alternate return label '%d' cannot be associated with %s"_err_en_US,
arg.GetLabel(), dummyName);
- return true;
- } else {
return false;
+ } else {
+ return true;
}
};
common::visit(
common::visitors{
[&](const characteristics::DummyDataObject &object) {
- if (!checkActualArgForLabel(arg)) {
+ if (CheckActualArgForLabel(arg)) {
ConvertBOZLiteralArg(arg, object.type.type());
if (auto *expr{arg.UnwrapExpr()}) {
if (auto type{characteristics::TypeAndShape::Characterize(
@@ -1147,9 +1160,16 @@ static void CheckExplicitInterfaceArg(evaluate::ActualArgument &arg,
evaluate::IsNullObjectPointer(*expr)) {
// ok, ASSOCIATED(NULL(without MOLD=))
} else if (object.type.attrs().test(characteristics::
- TypeAndShape::Attr::AssumedRank)) {
+ TypeAndShape::Attr::AssumedRank) &&
+ evaluate::IsNullObjectPointer(*expr) &&
+ (object.attrs.test(
+ characteristics::DummyDataObject::Attr::Allocatable) ||
+ object.attrs.test(
+ characteristics::DummyDataObject::Attr::Pointer) ||
+ !object.attrs.test(characteristics::DummyDataObject::
+ Attr::Optional))) {
messages.Say(
- "NULL() without MOLD= must not be associated with an assumed-rank dummy argument"_err_en_US);
+ "NULL() without MOLD= must not be associated with an assumed-rank dummy argument that is ALLOCATABLE, POINTER, or non-OPTIONAL"_err_en_US);
} else if ((object.attrs.test(characteristics::DummyDataObject::
Attr::Pointer) ||
object.attrs.test(characteristics::
@@ -1210,7 +1230,7 @@ static void CheckExplicitInterfaceArg(evaluate::ActualArgument &arg,
}
},
[&](const characteristics::DummyProcedure &dummy) {
- if (!checkActualArgForLabel(arg)) {
+ if (CheckActualArgForLabel(arg)) {
CheckProcedureArg(arg, proc, dummy, dummyName, context,
ignoreImplicitVsExplicit);
}
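A Fortran sketch (illustrative) of a call that the relaxed NULL() check above now accepts, because the assumed-rank dummy is OPTIONAL and neither ALLOCATABLE nor POINTER:

    interface
      subroutine s(x)
        real, optional :: x(..)
      end subroutine
    end interface
    call s(null())    ! NULL() without MOLD= stands for an absent argument here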
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index f564a0b69671..7034902dcc58 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -2430,16 +2430,18 @@ void CheckHelper::CheckProcBinding(
"A NOPASS type-bound procedure and its override must have identical interfaces"_err_en_US);
}
} else if (!context_.HasError(binding.symbol())) {
- int passIndex{bindingChars->FindPassIndex(binding.passName())};
- int overriddenPassIndex{
+ auto passIndex{bindingChars->FindPassIndex(binding.passName())};
+ auto overriddenPassIndex{
overriddenChars->FindPassIndex(overriddenBinding->passName())};
- if (passIndex != overriddenPassIndex) {
- SayWithDeclaration(*overridden,
- "A type-bound procedure and its override must use the same PASS argument"_err_en_US);
- } else if (!bindingChars->CanOverride(
- *overriddenChars, passIndex)) {
- SayWithDeclaration(*overridden,
- "A type-bound procedure and its override must have compatible interfaces"_err_en_US);
+ if (passIndex && overriddenPassIndex) {
+ if (*passIndex != *overriddenPassIndex) {
+ SayWithDeclaration(*overridden,
+ "A type-bound procedure and its override must use the same PASS argument"_err_en_US);
+ } else if (!bindingChars->CanOverride(
+ *overriddenChars, passIndex)) {
+ SayWithDeclaration(*overridden,
+ "A type-bound procedure and its override must have compatible interfaces"_err_en_US);
+ }
}
}
}
@@ -2960,32 +2962,6 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType(
return msgs;
}
-static UnorderedSymbolSet CollectEntryPointsWithDummy(const Symbol &dummy) {
- UnorderedSymbolSet entries;
- const Scope &subpScope{dummy.owner()};
- for (const auto &[_, ref] : subpScope.parent()) {
- const Symbol &x{*ref};
- if (const auto *subp{x.detailsIf<SubprogramDetails>()}) {
- if (x.scope() == &subpScope || subp->entryScope() == &dummy.owner()) {
- if (std::find(subp->dummyArgs().begin(), subp->dummyArgs().end(),
- &dummy) != subp->dummyArgs().end()) {
- entries.insert(x);
- }
- }
- }
- }
- return entries;
-}
-
-static bool AnyNonBindCEntry(const Symbol &dummy) {
- for (const Symbol &subp : CollectEntryPointsWithDummy(dummy)) {
- if (!subp.attrs().test(Attr::BIND_C)) {
- return true;
- }
- }
- return false;
-}
-
parser::Messages CheckHelper::WhyNotInteroperableObject(
const Symbol &symbol, bool isError) {
parser::Messages msgs;
@@ -2998,14 +2974,14 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(
examinedByWhyNotInteroperable_.insert(symbol);
CHECK(symbol.has<ObjectEntityDetails>());
if (isExplicitBindC && !symbol.owner().IsModule()) {
- messages_.Say(symbol.name(),
+ msgs.Say(symbol.name(),
"A variable with BIND(C) attribute may only appear in the specification part of a module"_err_en_US);
}
auto shape{evaluate::GetShape(foldingContext_, symbol)};
if (shape) {
if (evaluate::GetRank(*shape) == 0) { // 18.3.4
if (IsAllocatableOrPointer(symbol) && !IsDummy(symbol)) {
- messages_.Say(symbol.name(),
+ msgs.Say(symbol.name(),
"A scalar interoperable variable may not be ALLOCATABLE or POINTER"_err_en_US);
}
} else if (auto extents{
@@ -3026,33 +3002,26 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(
if (derived) {
if (derived->typeSymbol().attrs().test(Attr::BIND_C)) {
} else if (isError) {
- if (auto *msg{messages_.Say(symbol.name(),
- "The derived type of a BIND(C) object must also be BIND(C)"_err_en_US)}) {
- msg->Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
- }
- context_.SetError(symbol);
+ msgs.Say(symbol.name(),
+ "The derived type of a BIND(C) object must also be BIND(C)"_err_en_US)
+ .Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
} else if (auto bad{WhyNotInteroperableDerivedType(
derived->typeSymbol(), /*isError=*/false)};
bad.AnyFatalError()) {
- if (auto *msg{messages_.Say(symbol.name(),
- "The derived type of an interoperable object must be interoperable, but is not"_err_en_US)}) {
- msg->Attach(
- derived->typeSymbol().name(), "Non-interoperable type"_en_US);
- bad.AttachTo(*msg, parser::Severity::None);
- }
+ bad.AttachTo(
+ msgs.Say(symbol.name(),
+ "The derived type of an interoperable object must be interoperable, but is not"_err_en_US)
+ .Attach(derived->typeSymbol().name(),
+ "Non-interoperable type"_en_US),
+ parser::Severity::None);
} else {
- if (auto *msg{messages_.Say(symbol.name(),
- "The derived type of an interoperable object should be BIND(C)"_warn_en_US)}) {
- msg->Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
- }
+ msgs.Say(symbol.name(),
+ "The derived type of an interoperable object should be BIND(C)"_warn_en_US)
+ .Attach(derived->typeSymbol().name(), "Non-BIND(C) type"_en_US);
}
}
if (type->IsAssumedType()) { // ok
} else if (IsAssumedLengthCharacter(symbol)) {
- if (AnyNonBindCEntry(symbol)) {
- msgs.Say(symbol.name(),
- "An assumed-length dummy argument must not appear in a non-BIND(C) entry in a subprogram with an entry that must be interoperable"_err_en_US);
- }
} else if (IsAllocatableOrPointer(symbol) &&
type->category() == DeclTypeSpec::Character &&
type->characterTypeSpec().length().isDeferred()) {
@@ -3083,12 +3052,6 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(
msgs.Say(symbol.name(),
"An interoperable procedure with an OPTIONAL dummy argument might not be portable"_port_en_US);
}
- if (symbol.attrs().test(Attr::VALUE)) {
- if (AnyNonBindCEntry(symbol)) {
- msgs.Say(symbol.name(),
- "A VALUE dummy argument must not appear in a non-BIND(C) entry of a subprogram with an entry that must be interoperable"_err_en_US);
- }
- }
if (IsDescriptor(symbol) && IsPointer(symbol) &&
symbol.attrs().test(Attr::CONTIGUOUS)) {
msgs.Say(symbol.name(),
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index e9637b7bb591..5e3a5725c18d 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2310,6 +2310,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Reduction &x) {
if (CheckReductionOperators(x)) {
CheckReductionTypeList(x);
}
+ CheckReductionModifier(x);
}
bool OmpStructureChecker::CheckReductionOperators(
@@ -2394,6 +2395,64 @@ void OmpStructureChecker::CheckReductionTypeList(
}
}
+void OmpStructureChecker::CheckReductionModifier(
+ const parser::OmpClause::Reduction &x) {
+ using ReductionModifier = parser::OmpReductionClause::ReductionModifier;
+ const auto &maybeModifier{std::get<std::optional<ReductionModifier>>(x.v.t)};
+ if (!maybeModifier || *maybeModifier == ReductionModifier::Default) {
+    // No modifier, or the default modifier, is always ok.
+ return;
+ }
+ ReductionModifier modifier{*maybeModifier};
+ const DirectiveContext &dirCtx{GetContext()};
+ if (dirCtx.directive == llvm::omp::Directive::OMPD_loop) {
+ // [5.2:257:33-34]
+ // If a reduction-modifier is specified in a reduction clause that
+ // appears on the directive, then the reduction modifier must be
+ // default.
+ context_.Say(GetContext().clauseSource,
+ "REDUCTION modifier on LOOP directive must be DEFAULT"_err_en_US);
+ }
+ if (modifier == ReductionModifier::Task) {
+    // "Task" is only allowed on a worksharing or "parallel" directive.
+ static llvm::omp::Directive worksharing[]{
+ llvm::omp::Directive::OMPD_do, llvm::omp::Directive::OMPD_scope,
+ llvm::omp::Directive::OMPD_sections,
+ // There are more worksharing directives, but they do not apply:
+        // "for" is C/C++ only,
+        // "single" and "workshare" don't allow the reduction clause,
+ // "loop" has different restrictions (checked above).
+ };
+ if (dirCtx.directive != llvm::omp::Directive::OMPD_parallel &&
+ !llvm::is_contained(worksharing, dirCtx.directive)) {
+ context_.Say(GetContext().clauseSource,
+ "Modifier 'TASK' on REDUCTION clause is only allowed with "
+ "PARALLEL or worksharing directive"_err_en_US);
+ }
+ } else if (modifier == ReductionModifier::Inscan) {
+ // "Inscan" is only allowed on worksharing-loop, worksharing-loop simd,
+ // or "simd" directive.
+ // The worksharing-loop directives are OMPD_do and OMPD_for. Only the
+ // former is allowed in Fortran.
+ switch (dirCtx.directive) {
+ case llvm::omp::Directive::OMPD_do: // worksharing-loop
+ case llvm::omp::Directive::OMPD_do_simd: // worksharing-loop simd
+ case llvm::omp::Directive::OMPD_simd: // "simd"
+ break;
+ default:
+ context_.Say(GetContext().clauseSource,
+ "Modifier 'INSCAN' on REDUCTION clause is only allowed with "
+ "worksharing-loop, worksharing-loop simd, "
+ "or SIMD directive"_err_en_US);
+ }
+ } else {
+ // Catch-all for potential future modifiers to make sure that this
+ // function is up-to-date.
+ context_.Say(GetContext().clauseSource,
+ "Unexpected modifier on REDUCTION clause"_err_en_US);
+ }
+}
+
void OmpStructureChecker::CheckIntentInPointerAndDefinable(
const parser::OmpObjectList &objectList, const llvm::omp::Clause clause) {
for (const auto &ompObject : objectList.v) {
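A short OpenMP Fortran sketch (illustrative) of a reduction modifier that passes the new check; TASK is likewise accepted only on PARALLEL or a worksharing directive, and any modifier other than DEFAULT is rejected on the LOOP directive:

    s = 0
    !$omp do reduction(inscan, +: s)
    do i = 1, n
      s = s + a(i)
      !$omp scan inclusive(s)
      b(i) = s
    end do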
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index 1f7284307703..47705771e8e2 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -205,6 +205,7 @@ private:
bool CheckIntrinsicOperator(
const parser::DefinedOperator::IntrinsicOperator &);
void CheckReductionTypeList(const parser::OmpClause::Reduction &);
+ void CheckReductionModifier(const parser::OmpClause::Reduction &);
void CheckMasterNesting(const parser::OpenMPBlockConstruct &x);
void ChecksOnOrderedAsBlock();
void CheckBarrierNesting(const parser::OpenMPSimpleStandaloneConstruct &x);
diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp
index 06e38da6626a..50e2b41212d7 100644
--- a/flang/lib/Semantics/expression.cpp
+++ b/flang/lib/Semantics/expression.cpp
@@ -1600,16 +1600,23 @@ private:
parser::CharBlock name, std::int64_t lower, std::int64_t upper,
std::int64_t stride);
- template <int KIND, typename A>
- std::optional<Expr<Type<TypeCategory::Integer, KIND>>> GetSpecificIntExpr(
- const A &x) {
- if (MaybeExpr y{exprAnalyzer_.Analyze(x)}) {
+ template <int KIND>
+ std::optional<Expr<Type<TypeCategory::Integer, KIND>>> ToSpecificInt(
+ MaybeExpr &&y) {
+ if (y) {
Expr<SomeInteger> *intExpr{UnwrapExpr<Expr<SomeInteger>>(*y)};
return Fold(exprAnalyzer_.GetFoldingContext(),
ConvertToType<Type<TypeCategory::Integer, KIND>>(
std::move(DEREF(intExpr))));
+ } else {
+ return std::nullopt;
}
- return std::nullopt;
+ }
+
+ template <int KIND, typename A>
+ std::optional<Expr<Type<TypeCategory::Integer, KIND>>> GetSpecificIntExpr(
+ const A &x) {
+ return ToSpecificInt<KIND>(exprAnalyzer_.Analyze(x));
}
// Nested array constructors all reference the same ExpressionAnalyzer,
@@ -1772,26 +1779,45 @@ void ArrayConstructorContext::Add(const parser::AcValue &x) {
// Transforms l:u(:s) into (_,_=l,u(,s)) with an anonymous index '_'
void ArrayConstructorContext::Add(const parser::AcValue::Triplet &triplet) {
- std::optional<Expr<ImpliedDoIntType>> lower{
- GetSpecificIntExpr<ImpliedDoIntType::kind>(std::get<0>(triplet.t))};
- std::optional<Expr<ImpliedDoIntType>> upper{
- GetSpecificIntExpr<ImpliedDoIntType::kind>(std::get<1>(triplet.t))};
- std::optional<Expr<ImpliedDoIntType>> stride{
- GetSpecificIntExpr<ImpliedDoIntType::kind>(std::get<2>(triplet.t))};
- if (lower && upper) {
- if (!stride) {
- stride = Expr<ImpliedDoIntType>{1};
- }
- if (!type_) {
- type_ = DynamicTypeWithLength{ImpliedDoIntType::GetType()};
+ MaybeExpr lowerExpr{exprAnalyzer_.Analyze(std::get<0>(triplet.t))};
+ MaybeExpr upperExpr{exprAnalyzer_.Analyze(std::get<1>(triplet.t))};
+ MaybeExpr strideExpr{exprAnalyzer_.Analyze(std::get<2>(triplet.t))};
+ if (lowerExpr && upperExpr) {
+ auto lowerType{lowerExpr->GetType()};
+ auto upperType{upperExpr->GetType()};
+ auto strideType{strideExpr ? strideExpr->GetType() : lowerType};
+ if (lowerType && upperType && strideType) {
+ int kind{lowerType->kind()};
+ if (upperType->kind() > kind) {
+ kind = upperType->kind();
+ }
+ if (strideType->kind() > kind) {
+ kind = strideType->kind();
+ }
+ auto lower{ToSpecificInt<ImpliedDoIntType::kind>(std::move(lowerExpr))};
+ auto upper{ToSpecificInt<ImpliedDoIntType::kind>(std::move(upperExpr))};
+ if (lower && upper) {
+ auto stride{
+ ToSpecificInt<ImpliedDoIntType::kind>(std::move(strideExpr))};
+ if (!stride) {
+ stride = Expr<ImpliedDoIntType>{1};
+ }
+ DynamicType type{TypeCategory::Integer, kind};
+ if (!type_) {
+ type_ = DynamicTypeWithLength{type};
+ }
+ parser::CharBlock anonymous;
+ if (auto converted{ConvertToType(type,
+ AsGenericExpr(
+ Expr<ImpliedDoIntType>{ImpliedDoIndex{anonymous}}))}) {
+ auto v{std::move(values_)};
+ Push(std::move(converted));
+ std::swap(v, values_);
+ values_.Push(ImpliedDo<SomeType>{anonymous, std::move(*lower),
+ std::move(*upper), std::move(*stride), std::move(v)});
+ }
+ }
}
- auto v{std::move(values_)};
- parser::CharBlock anonymous;
- Push(Expr<SomeType>{
- Expr<SomeInteger>{Expr<ImpliedDoIntType>{ImpliedDoIndex{anonymous}}}});
- std::swap(v, values_);
- values_.Push(ImpliedDo<SomeType>{anonymous, std::move(*lower),
- std::move(*upper), std::move(*stride), std::move(v)});
}
}
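An illustrative Fortran sketch using the array-constructor triplet extension handled here; the kinds of the bounds now determine the kind of the implied-DO values instead of always using the default integer kind:

    integer(8), parameter :: big = 6000000000_8
    print *, [ big - 2_8 : big ]   ! values are produced as INTEGER(8)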
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index bb8c6c7567b8..d7f149467dd7 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -46,11 +46,11 @@ struct ModHeader {
};
static std::optional<SourceName> GetSubmoduleParent(const parser::Program &);
-static void CollectSymbols(const Scope &, SymbolVector &, SymbolVector &,
- std::map<const Symbol *, SourceName> &, UnorderedSymbolSet &);
+static void CollectSymbols(
+ const Scope &, SymbolVector &, SymbolVector &, UnorderedSymbolSet &);
static void PutPassName(llvm::raw_ostream &, const std::optional<SourceName> &);
static void PutInit(llvm::raw_ostream &, const Symbol &, const MaybeExpr &,
- const parser::Expr *, const std::map<const Symbol *, SourceName> &);
+ const parser::Expr *);
static void PutInit(llvm::raw_ostream &, const MaybeIntExpr &);
static void PutBound(llvm::raw_ostream &, const Bound &);
static void PutShapeSpec(llvm::raw_ostream &, const ShapeSpec &);
@@ -200,47 +200,105 @@ std::string ModFileWriter::GetAsString(const Symbol &symbol) {
return all.str();
}
-// Collect symbols from initializations that are being referenced directly
-// from other modules; they may require new USE associations.
-static void HarvestInitializerSymbols(
- SourceOrderedSymbolSet &set, const Scope &scope) {
- for (const auto &[_, symbol] : scope) {
- if (symbol->has<DerivedTypeDetails>()) {
- if (symbol->scope()) {
- HarvestInitializerSymbols(set, *symbol->scope());
+// Collect symbols from constant and specification expressions that are being
+// referenced directly from other modules; they may require new USE
+// associations.
+static void HarvestSymbolsNeededFromOtherModules(
+ SourceOrderedSymbolSet &, const Scope &);
+static void HarvestSymbolsNeededFromOtherModules(
+ SourceOrderedSymbolSet &set, const Symbol &symbol, const Scope &scope) {
+ auto HarvestBound{[&](const Bound &bound) {
+ if (const auto &expr{bound.GetExplicit()}) {
+ for (SymbolRef ref : evaluate::CollectSymbols(*expr)) {
+ set.emplace(*ref);
}
- } else if (const auto &generic{symbol->detailsIf<GenericDetails>()};
- generic && generic->derivedType()) {
- const Symbol &dtSym{*generic->derivedType()};
- if (dtSym.has<DerivedTypeDetails>()) {
- if (dtSym.scope()) {
- HarvestInitializerSymbols(set, *dtSym.scope());
- }
- } else {
- CHECK(dtSym.has<UseDetails>() || dtSym.has<UseErrorDetails>());
+ }
+ }};
+ auto HarvestShapeSpec{[&](const ShapeSpec &shapeSpec) {
+ HarvestBound(shapeSpec.lbound());
+ HarvestBound(shapeSpec.ubound());
+ }};
+ auto HarvestArraySpec{[&](const ArraySpec &arraySpec) {
+ for (const auto &shapeSpec : arraySpec) {
+ HarvestShapeSpec(shapeSpec);
+ }
+ }};
+
+ if (symbol.has<DerivedTypeDetails>()) {
+ if (symbol.scope()) {
+ HarvestSymbolsNeededFromOtherModules(set, *symbol.scope());
+ }
+ } else if (const auto &generic{symbol.detailsIf<GenericDetails>()};
+ generic && generic->derivedType()) {
+ const Symbol &dtSym{*generic->derivedType()};
+ if (dtSym.has<DerivedTypeDetails>()) {
+ if (dtSym.scope()) {
+ HarvestSymbolsNeededFromOtherModules(set, *dtSym.scope());
}
- } else if (IsNamedConstant(*symbol) || scope.IsDerivedType()) {
- if (const auto *object{symbol->detailsIf<ObjectEntityDetails>()}) {
- if (object->init()) {
- for (SymbolRef ref : evaluate::CollectSymbols(*object->init())) {
- set.emplace(*ref);
- }
- }
- } else if (const auto *proc{symbol->detailsIf<ProcEntityDetails>()}) {
- if (proc->init() && *proc->init()) {
- set.emplace(**proc->init());
+ } else {
+ CHECK(dtSym.has<UseDetails>() || dtSym.has<UseErrorDetails>());
+ }
+ } else if (const auto *object{symbol.detailsIf<ObjectEntityDetails>()}) {
+ HarvestArraySpec(object->shape());
+ HarvestArraySpec(object->coshape());
+ if (IsNamedConstant(symbol) || scope.IsDerivedType()) {
+ if (object->init()) {
+ for (SymbolRef ref : evaluate::CollectSymbols(*object->init())) {
+ set.emplace(*ref);
}
}
}
+ } else if (const auto *proc{symbol.detailsIf<ProcEntityDetails>()}) {
+ if (proc->init() && *proc->init() && scope.IsDerivedType()) {
+ set.emplace(**proc->init());
+ }
+ } else if (const auto *subp{symbol.detailsIf<SubprogramDetails>()}) {
+ for (const Symbol *dummy : subp->dummyArgs()) {
+ if (dummy) {
+ HarvestSymbolsNeededFromOtherModules(set, *dummy, scope);
+ }
+ }
+ if (subp->isFunction()) {
+ HarvestSymbolsNeededFromOtherModules(set, subp->result(), scope);
+ }
+ }
+}
+
+static void HarvestSymbolsNeededFromOtherModules(
+ SourceOrderedSymbolSet &set, const Scope &scope) {
+ for (const auto &[_, symbol] : scope) {
+ HarvestSymbolsNeededFromOtherModules(set, *symbol, scope);
}
}
void ModFileWriter::PrepareRenamings(const Scope &scope) {
- SourceOrderedSymbolSet symbolsInInits;
- HarvestInitializerSymbols(symbolsInInits, scope);
- for (SymbolRef s : symbolsInInits) {
+ // Identify use-associated symbols already in scope under some name
+ std::map<const Symbol *, const Symbol *> useMap;
+ for (const auto &[name, symbolRef] : scope) {
+ const Symbol *symbol{&*symbolRef};
+ while (const auto *hostAssoc{symbol->detailsIf<HostAssocDetails>()}) {
+ symbol = &hostAssoc->symbol();
+ }
+ if (const auto *use{symbol->detailsIf<UseDetails>()}) {
+ useMap.emplace(&use->symbol(), symbol);
+ }
+ }
+ // Collect symbols needed from other modules
+ SourceOrderedSymbolSet symbolsNeeded;
+ HarvestSymbolsNeededFromOtherModules(symbolsNeeded, scope);
+ // Establish any necessary renamings of symbols in other modules
+ // to their names in this scope, creating those new names when needed.
+ auto &renamings{context_.moduleFileOutputRenamings()};
+ for (SymbolRef s : symbolsNeeded) {
+ if (s->owner().kind() == Scope::Kind::DerivedType) {
+ continue; // component or binding: ok
+ }
const Scope *sMod{FindModuleContaining(s->owner())};
- if (!sMod) {
+ if (!sMod || sMod == &scope) {
+ continue;
+ }
+ if (auto iter{useMap.find(&*s)}; iter != useMap.end()) {
+ renamings.emplace(&*s, iter->second->name());
continue;
}
SourceName rename{s->name()};
@@ -272,10 +330,10 @@ void ModFileWriter::PrepareRenamings(const Scope &scope) {
uses_ << DEREF(sMod->symbol()).name() << ",only:";
if (rename != s->name()) {
uses_ << rename << "=>";
+ renamings.emplace(&*s, rename);
}
uses_ << s->name() << '\n';
useExtraAttrs_ << "private::" << rename << '\n';
- renamings_.emplace(&*s, rename);
}
}
@@ -283,9 +341,11 @@ void ModFileWriter::PrepareRenamings(const Scope &scope) {
void ModFileWriter::PutSymbols(const Scope &scope) {
SymbolVector sorted;
SymbolVector uses;
+ auto &renamings{context_.moduleFileOutputRenamings()};
+ auto previousRenamings{std::move(renamings)};
PrepareRenamings(scope);
UnorderedSymbolSet modules;
- CollectSymbols(scope, sorted, uses, renamings_, modules);
+ CollectSymbols(scope, sorted, uses, modules);
// Write module files for dependencies first so that their
// hashes are known.
for (auto ref : modules) {
@@ -318,6 +378,7 @@ void ModFileWriter::PutSymbols(const Scope &scope) {
}
}
CHECK(typeBindings.str().empty());
+ renamings = std::move(previousRenamings);
}
// Emit components in order
@@ -521,7 +582,7 @@ void ModFileWriter::PutDECStructure(
}
decls_ << ref->name();
PutShape(decls_, object->shape(), '(', ')');
- PutInit(decls_, *ref, object->init(), nullptr, renamings_);
+ PutInit(decls_, *ref, object->init(), nullptr);
emittedDECFields_.insert(*ref);
} else if (any) {
break; // any later use of this structure will use RECORD/str/
@@ -767,8 +828,7 @@ static inline SourceName NameInModuleFile(const Symbol &symbol) {
// Collect the symbols of this scope sorted by their original order, not name.
// Generics and namelists are exceptions: they are sorted after other symbols.
void CollectSymbols(const Scope &scope, SymbolVector &sorted,
- SymbolVector &uses, std::map<const Symbol *, SourceName> &renamings,
- UnorderedSymbolSet &modules) {
+ SymbolVector &uses, UnorderedSymbolSet &modules) {
SymbolVector namelist, generics;
auto symbols{scope.GetSymbols()};
std::size_t commonSize{scope.commonBlocks().size()};
@@ -878,8 +938,7 @@ void ModFileWriter::PutObjectEntity(
getSymbolAttrsToWrite(symbol));
PutShape(os, details.shape(), '(', ')');
PutShape(os, details.coshape(), '[', ']');
- PutInit(os, symbol, details.init(), details.unanalyzedPDTComponentInit(),
- renamings_);
+ PutInit(os, symbol, details.init(), details.unanalyzedPDTComponentInit());
os << '\n';
if (auto tkr{GetIgnoreTKR(symbol)}; !tkr.empty()) {
os << "!dir$ ignore_tkr(";
@@ -973,25 +1032,12 @@ void ModFileWriter::PutTypeParam(llvm::raw_ostream &os, const Symbol &symbol) {
}
void PutInit(llvm::raw_ostream &os, const Symbol &symbol, const MaybeExpr &init,
- const parser::Expr *unanalyzed,
- const std::map<const Symbol *, SourceName> &renamings) {
+ const parser::Expr *unanalyzed) {
if (IsNamedConstant(symbol) || symbol.owner().IsDerivedType()) {
const char *assign{symbol.attrs().test(Attr::POINTER) ? "=>" : "="};
if (unanalyzed) {
parser::Unparse(os << assign, *unanalyzed);
} else if (init) {
- if (const auto *dtConst{
- evaluate::UnwrapExpr<evaluate::Constant<evaluate::SomeDerived>>(
- *init)}) {
- const Symbol &dtSym{dtConst->result().derivedTypeSpec().typeSymbol()};
- if (auto iter{renamings.find(&dtSym)}; iter != renamings.end()) {
- // Initializer is a constant whose derived type's name has
- // been brought into scope from a module under a new name
- // to avoid a conflict.
- dtConst->AsFortran(os << assign, &iter->second);
- return;
- }
- }
init->AsFortran(os << assign);
}
}
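A Fortran sketch (illustrative) of the new harvesting of specification expressions: the bound 'n' below comes from another module, so the module file for m2 must USE it (under its local name or a rename) for the re-emitted declaration of 'a' to remain valid:

    module m1
      integer, parameter :: n = 4
    end module
    module m2
      use m1, only: n
      real :: a(n)
    end module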
diff --git a/flang/lib/Semantics/mod-file.h b/flang/lib/Semantics/mod-file.h
index 739add32c2e0..be44780bef43 100644
--- a/flang/lib/Semantics/mod-file.h
+++ b/flang/lib/Semantics/mod-file.h
@@ -57,7 +57,6 @@ private:
llvm::raw_string_ostream decls_{declsBuf_};
llvm::raw_string_ostream contains_{containsBuf_};
bool isSubmodule_{false};
- std::map<const Symbol *, SourceName> renamings_;
void WriteAll(const Scope &);
void WriteOne(const Scope &);
diff --git a/flang/lib/Semantics/resolve-names-utils.cpp b/flang/lib/Semantics/resolve-names-utils.cpp
index 3ca460b8e46a..e27a54361749 100644
--- a/flang/lib/Semantics/resolve-names-utils.cpp
+++ b/flang/lib/Semantics/resolve-names-utils.cpp
@@ -376,25 +376,35 @@ static void PropagateSaveAttr(const EquivalenceSet &src, EquivalenceSet &dst) {
void EquivalenceSets::AddToSet(const parser::Designator &designator) {
if (CheckDesignator(designator)) {
- Symbol &symbol{*currObject_.symbol};
- if (!currSet_.empty()) {
- // check this symbol against first of set for compatibility
- Symbol &first{currSet_.front().symbol};
- CheckCanEquivalence(designator.source, first, symbol) &&
- CheckCanEquivalence(designator.source, symbol, first);
- }
- auto subscripts{currObject_.subscripts};
- if (subscripts.empty() && symbol.IsObjectArray()) {
- // record a whole array as its first element
- for (const ShapeSpec &spec : symbol.get<ObjectEntityDetails>().shape()) {
- auto &lbound{spec.lbound().GetExplicit().value()};
- subscripts.push_back(evaluate::ToInt64(lbound).value());
+ if (Symbol * symbol{currObject_.symbol}) {
+ if (!currSet_.empty()) {
+ // check this symbol against first of set for compatibility
+ Symbol &first{currSet_.front().symbol};
+ CheckCanEquivalence(designator.source, first, *symbol) &&
+ CheckCanEquivalence(designator.source, *symbol, first);
+ }
+ auto subscripts{currObject_.subscripts};
+ if (subscripts.empty()) {
+ if (const ArraySpec * shape{symbol->GetShape()};
+ shape && shape->IsExplicitShape()) {
+ // record a whole array as its first element
+ for (const ShapeSpec &spec : *shape) {
+ if (auto lbound{spec.lbound().GetExplicit()}) {
+ if (auto lbValue{evaluate::ToInt64(*lbound)}) {
+ subscripts.push_back(*lbValue);
+ continue;
+ }
+ }
+ subscripts.clear(); // error recovery
+ break;
+ }
+ }
}
+ auto substringStart{currObject_.substringStart};
+ currSet_.emplace_back(
+ *symbol, subscripts, substringStart, designator.source);
+ PropagateSaveAttr(currSet_.back(), currSet_);
}
- auto substringStart{currObject_.substringStart};
- currSet_.emplace_back(
- symbol, subscripts, substringStart, designator.source);
- PropagateSaveAttr(currSet_.back(), currSet_);
}
currObject_ = {};
}
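Note on the equivalence change above: besides guarding against a null symbol, the rewrite only expands a whole array into its first element when the shape is explicit and every lower bound folds to a constant; otherwise the subscripts are dropped for error recovery. A minimal standalone sketch of that bound-collection step, using hypothetical types rather than flang's ShapeSpec machinery:

    #include <optional>
    #include <vector>

    // Collect the first-element subscripts of an explicit-shape array.
    // A missing (non-constant) lower bound abandons the expansion,
    // mirroring the "subscripts.clear(); break;" recovery path above.
    std::vector<long> FirstElementSubscripts(
        const std::vector<std::optional<long>> &lowerBounds) {
      std::vector<long> subscripts;
      for (const auto &lb : lowerBounds) {
        if (!lb) {
          return {};
        }
        subscripts.push_back(*lb);
      }
      return subscripts;
    }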
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index a46c0f378d5d..68cfc8641b9b 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -6227,7 +6227,7 @@ void DeclarationVisitor::CheckEquivalenceSets() {
}
for (const parser::EquivalenceObject &object : *set) {
const auto &designator{object.v.value()};
- // The designator was not resolved when it was encountered so do it now.
+ // The designator was not resolved when it was encountered, so do it now.
// AnalyzeExpr causes array sections to be changed to substrings as needed
Walk(designator);
if (AnalyzeExpr(context(), designator)) {
@@ -7846,28 +7846,31 @@ bool DeclarationVisitor::CheckForHostAssociatedImplicit(
if (name.symbol) {
ApplyImplicitRules(*name.symbol, true);
}
- Symbol *hostSymbol;
- Scope *host{GetHostProcedure()};
- if (!host || isImplicitNoneType(*host)) {
- return false;
- }
- if (!name.symbol) {
- hostSymbol = &MakeSymbol(*host, name.source, Attrs{});
- ConvertToObjectEntity(*hostSymbol);
- ApplyImplicitRules(*hostSymbol);
- hostSymbol->set(Symbol::Flag::ImplicitOrError);
- } else if (name.symbol->test(Symbol::Flag::ImplicitOrError)) {
- hostSymbol = name.symbol;
- } else {
- return false;
- }
- Symbol &symbol{MakeHostAssocSymbol(name, *hostSymbol)};
- if (isImplicitNoneType()) {
- symbol.get<HostAssocDetails>().implicitOrExplicitTypeError = true;
- } else {
- symbol.get<HostAssocDetails>().implicitOrSpecExprError = true;
+ if (Scope * host{GetHostProcedure()}; host && !isImplicitNoneType(*host)) {
+ Symbol *hostSymbol{nullptr};
+ if (!name.symbol) {
+ if (currScope().CanImport(name.source)) {
+ hostSymbol = &MakeSymbol(*host, name.source, Attrs{});
+ ConvertToObjectEntity(*hostSymbol);
+ ApplyImplicitRules(*hostSymbol);
+ hostSymbol->set(Symbol::Flag::ImplicitOrError);
+ }
+ } else if (name.symbol->test(Symbol::Flag::ImplicitOrError)) {
+ hostSymbol = name.symbol;
+ }
+ if (hostSymbol) {
+ Symbol &symbol{MakeHostAssocSymbol(name, *hostSymbol)};
+ if (auto *assoc{symbol.detailsIf<HostAssocDetails>()}) {
+ if (isImplicitNoneType()) {
+ assoc->implicitOrExplicitTypeError = true;
+ } else {
+ assoc->implicitOrSpecExprError = true;
+ }
+ return true;
+ }
+ }
}
- return true;
+ return false;
}
bool DeclarationVisitor::IsUplevelReference(const Symbol &symbol) {
diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp
index 381905b89fb2..3eb120fd962f 100644
--- a/flang/lib/Semantics/symbol.cpp
+++ b/flang/lib/Semantics/symbol.cpp
@@ -385,9 +385,17 @@ bool Symbol::IsFuncResult() const {
details_);
}
+const ArraySpec *Symbol::GetShape() const {
+ if (const auto *details{std::get_if<ObjectEntityDetails>(&details_)}) {
+ return &details->shape();
+ } else {
+ return nullptr;
+ }
+}
+
bool Symbol::IsObjectArray() const {
- const auto *details{std::get_if<ObjectEntityDetails>(&details_)};
- return details && details->IsArray();
+ const ArraySpec *shape{GetShape()};
+ return shape && !shape->empty();
}
bool Symbol::IsSubprogram() const {
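The new Symbol::GetShape accessor above exposes an object entity's array shape (or nullptr for any other kind of symbol), and IsObjectArray now simply asks whether that shape is non-empty. A minimal self-contained sketch of the same std::get_if pattern, with hypothetical stand-in types rather than flang's details variant:

    #include <variant>
    #include <vector>

    struct ObjectDetails { std::vector<int> shape; };
    struct ProcDetails {};

    struct Sym {
      std::variant<ObjectDetails, ProcDetails> details;
      // Return the shape if this symbol holds object details, else nullptr.
      const std::vector<int> *GetShape() const {
        if (const auto *d = std::get_if<ObjectDetails>(&details)) {
          return &d->shape;
        }
        return nullptr;
      }
      // An object is an array iff it has at least one shape specifier.
      bool IsObjectArray() const {
        const auto *shape = GetShape();
        return shape && !shape->empty();
      }
    };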
diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp
index 6b24c5648318..1a73c85df840 100644
--- a/flang/runtime/edit-output.cpp
+++ b/flang/runtime/edit-output.cpp
@@ -263,7 +263,6 @@ template <int KIND>
RT_API_ATTRS decimal::ConversionToDecimalResult
RealOutputEditing<KIND>::ConvertToDecimal(
int significantDigits, enum decimal::FortranRounding rounding, int flags) {
-#if !defined(RT_DEVICE_COMPILATION)
auto converted{decimal::ConvertToDecimal<binaryPrecision>(buffer_,
sizeof buffer_, static_cast<enum decimal::DecimalConversionFlags>(flags),
significantDigits, rounding, x_)};
@@ -273,10 +272,6 @@ RealOutputEditing<KIND>::ConvertToDecimal(
sizeof buffer_);
}
return converted;
-#else // defined(RT_DEVICE_COMPILATION)
- // TODO: enable Decimal library build for the device.
- io_.GetIoErrorHandler().Crash("not implemented yet: decimal conversion");
-#endif // defined(RT_DEVICE_COMPILATION)
}
static RT_API_ATTRS bool IsInfOrNaN(const char *p, int length) {
diff --git a/flang/runtime/external-unit.cpp b/flang/runtime/external-unit.cpp
index b48549d54587..4bfa218bb776 100644
--- a/flang/runtime/external-unit.cpp
+++ b/flang/runtime/external-unit.cpp
@@ -214,6 +214,13 @@ Iostat ExternalFileUnit::SetDirection(Direction direction) {
}
} else {
if (mayWrite()) {
+ if (direction_ == Direction::Input) {
+ // Don't retain any input data from previous record, like a
+ // variable-length unformatted record footer, in the frame,
+        // since we're going to start writing frames.
+ frameOffsetInFile_ += recordOffsetInFrame_;
+ recordOffsetInFrame_ = 0;
+ }
direction_ = Direction::Output;
return IostatOk;
} else {
@@ -332,5 +339,4 @@ bool ExternalFileUnit::Wait(int id) {
}
} // namespace Fortran::runtime::io
-
#endif // !defined(RT_USE_PSEUDO_FILE_UNIT)
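The SetDirection change above folds any pending record offset into the frame's file offset before the unit starts writing, so stale input bytes (such as a variable-length unformatted record footer) cannot be flushed back out. A rough standalone sketch of that bookkeeping, with hypothetical member names:

    struct UnitOffsets {
      long frameOffsetInFile{0};
      long recordOffsetInFrame{0};
      bool reading{true};
      // Switching from reading to writing: absorb the record offset into the
      // file offset and clear it, leaving no leftover input data in the frame.
      void switchToWriting() {
        if (reading) {
          frameOffsetInFile += recordOffsetInFrame;
          recordOffsetInFrame = 0;
          reading = false;
        }
      }
    };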
diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp
index 52b5a56894d8..2225473c4690 100644
--- a/flang/runtime/numeric.cpp
+++ b/flang/runtime/numeric.cpp
@@ -117,13 +117,13 @@ inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedIntKind(T x) {
template <typename T>
inline RT_API_ATTRS CppTypeFor<TypeCategory::Integer, 4> SelectedLogicalKind(
T x) {
- if (x <= 2) {
+ if (x <= 8) {
return 1;
- } else if (x <= 4) {
+ } else if (x <= 16) {
return 2;
- } else if (x <= 9) {
+ } else if (x <= 32) {
return 4;
- } else if (x <= 18) {
+ } else if (x <= 64) {
return 8;
}
return -1;
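The corrected thresholds above reflect that SELECTED_LOGICAL_KIND(BITS) returns the smallest LOGICAL kind occupying at least BITS bits, so the cutoffs are storage sizes (8/16/32/64 bits) rather than the decimal-digit ranges used by SELECTED_INT_KIND. A hedged standalone sketch of the fixed mapping, not the runtime's exact types:

    // Smallest supported logical kind with at least `bits` bits of storage,
    // or -1 if no kind is wide enough.
    int SelectedLogicalKind(int bits) {
      if (bits <= 8) return 1;
      if (bits <= 16) return 2;
      if (bits <= 32) return 4;
      if (bits <= 64) return 8;
      return -1;
    }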
diff --git a/flang/runtime/terminator.h b/flang/runtime/terminator.h
index 59a47ce93e7c..609f059d6e09 100644
--- a/flang/runtime/terminator.h
+++ b/flang/runtime/terminator.h
@@ -54,7 +54,7 @@ public:
// to regular printf for the device compilation.
// Try to keep the inline implementations as small as possible.
template <typename... Args>
- [[noreturn]] RT_API_ATTRS const char *Crash(
+ [[noreturn]] RT_DEVICE_NOINLINE RT_API_ATTRS const char *Crash(
const char *message, Args... args) const {
#if !defined(RT_DEVICE_COMPILATION)
// Invoke handler set up by the test harness.
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 3b42f45d5588..a11f444d8d75 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -265,6 +265,7 @@ void ExternalFileUnit::FinishReadingRecord(IoErrorHandler &handler) {
furthestPositionInRecord =
std::max(furthestPositionInRecord, positionInRecord);
frameOffsetInFile_ += recordOffsetInFrame_ + furthestPositionInRecord;
+ recordOffsetInFrame_ = 0;
}
BeginRecord();
}
diff --git a/flang/test/Driver/fopenmp.f90 b/flang/test/Driver/fopenmp.f90
index c71d34dc9e7e..d70fe100c3d2 100644
--- a/flang/test/Driver/fopenmp.f90
+++ b/flang/test/Driver/fopenmp.f90
@@ -14,7 +14,7 @@
! CHECK-FC1-OPENMP: "-fc1"
! CHECK-FC1-OPENMP: "-fopenmp"
!
-! CHECK-WARNING: warning: The library '-fopenmp=={{.*}}' is not supported, openmp is not be enabled
+! CHECK-WARNING: warning: the library '-fopenmp=={{.*}}' is not supported, OpenMP will not be enabled
! CHECK-FC1-NO-OPENMP: "-fc1"
! CHECK-FC1-NO-OPENMP-NOT: "-fopenmp"
!
@@ -51,9 +51,14 @@
! We'd like to check that the default is sane, but until we have the ability
! to *always* semantically analyze OpenMP without always generating runtime
! calls (in the event of an unsupported runtime), we don't have a good way to
-! test the CC1 invocation. Instead, just ensure we do eventually link *some*
+! test the FC1 invocation. Instead, just ensure we do eventually link *some*
! OpenMP runtime.
!
+! RUN: %flang -target x86_64-linux-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+! RUN: %flang -target x86_64-darwin -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+! RUN: %flang -target x86_64-freebsd -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANY
+! RUN: %flang -target x86_64-windows-gnu -fopenmp %s -o %t -### 2>&1 | FileCheck %s --check-prefix=CHECK-LD-ANYMD
+!
! CHECK-LD-ANY: "{{.*}}ld{{(.exe)?}}"
! CHECK-LD-ANY: "-l{{(omp|gomp|iomp5)}}"
!
diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90
index 6d0e6c3bdcce..b3712db4ac61 100644
--- a/flang/test/Driver/mlir-pass-pipeline.f90
+++ b/flang/test/Driver/mlir-pass-pipeline.f90
@@ -30,8 +30,15 @@ end program
! O2-NEXT: CSE
! O2-NEXT: (S) {{.*}} num-cse'd
! O2-NEXT: (S) {{.*}} num-dce'd
+! O2-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+! O2-NEXT: 'fir.global' Pipeline
+! O2-NEXT: OptimizedBufferization
! O2-NEXT: 'func.func' Pipeline
! O2-NEXT: OptimizedBufferization
+! O2-NEXT: 'omp.declare_reduction' Pipeline
+! O2-NEXT: OptimizedBufferization
+! O2-NEXT: 'omp.private' Pipeline
+! O2-NEXT: OptimizedBufferization
! ALL: LowerHLFIROrderedAssignments
! ALL-NEXT: LowerHLFIRIntrinsics
! ALL-NEXT: BufferizeHLFIR
diff --git a/flang/test/Driver/w-arg-unsupported.f90 b/flang/test/Driver/w-arg-unsupported.f90
index 1ef25fdd4db0..be753bfc6784 100644
--- a/flang/test/Driver/w-arg-unsupported.f90
+++ b/flang/test/Driver/w-arg-unsupported.f90
@@ -6,32 +6,32 @@
! RUN: -Wrealloc-lhs -Wrealloc-lhs-all -Wfrontend-loop-interchange -Wtarget-lifetime %s \
! RUN: -c 2>&1 | FileCheck %s
-! CHECK: The warning option '-Wextra' is not supported
-! CHECK-NEXT: The warning option '-Waliasing' is not supported
-! CHECK-NEXT: The warning option '-Wampersand' is not supported
-! CHECK-NEXT: The warning option '-Warray-bounds' is not supported
-! CHECK-NEXT: The warning option '-Wc-binding-type' is not supported
-! CHECK-NEXT: The warning option '-Wcharacter-truncation' is not supported
-! CHECK-NEXT: The warning option '-Wconversion' is not supported
-! CHECK-NEXT: The warning option '-Wdo-subscript' is not supported
-! CHECK-NEXT: The warning option '-Wfunction-elimination' is not supported
-! CHECK-NEXT: The warning option '-Wimplicit-interface' is not supported
-! CHECK-NEXT: The warning option '-Wimplicit-procedure' is not supported
-! CHECK-NEXT: The warning option '-Wintrinsic-shadow' is not supported
-! CHECK-NEXT: The warning option '-Wuse-without-only' is not supported
-! CHECK-NEXT: The warning option '-Wintrinsics-std' is not supported
-! CHECK-NEXT: The warning option '-Wline-truncation' is not supported
-! CHECK-NEXT: The warning option '-Wno-align-commons' is not supported
-! CHECK-NEXT: The warning option '-Wno-overwrite-recursive' is not supported
-! CHECK-NEXT: The warning option '-Wno-tabs' is not supported
-! CHECK-NEXT: The warning option '-Wreal-q-constant' is not supported
-! CHECK-NEXT: The warning option '-Wsurprising' is not supported
-! CHECK-NEXT: The warning option '-Wunderflow' is not supported
-! CHECK-NEXT: The warning option '-Wunused-parameter' is not supported
-! CHECK-NEXT: The warning option '-Wrealloc-lhs' is not supported
-! CHECK-NEXT: The warning option '-Wrealloc-lhs-all' is not supported
-! CHECK-NEXT: The warning option '-Wfrontend-loop-interchange' is not supported
-! CHECK-NEXT: The warning option '-Wtarget-lifetime' is not supported
+! CHECK: the warning option '-Wextra' is not supported
+! CHECK-NEXT: the warning option '-Waliasing' is not supported
+! CHECK-NEXT: the warning option '-Wampersand' is not supported
+! CHECK-NEXT: the warning option '-Warray-bounds' is not supported
+! CHECK-NEXT: the warning option '-Wc-binding-type' is not supported
+! CHECK-NEXT: the warning option '-Wcharacter-truncation' is not supported
+! CHECK-NEXT: the warning option '-Wconversion' is not supported
+! CHECK-NEXT: the warning option '-Wdo-subscript' is not supported
+! CHECK-NEXT: the warning option '-Wfunction-elimination' is not supported
+! CHECK-NEXT: the warning option '-Wimplicit-interface' is not supported
+! CHECK-NEXT: the warning option '-Wimplicit-procedure' is not supported
+! CHECK-NEXT: the warning option '-Wintrinsic-shadow' is not supported
+! CHECK-NEXT: the warning option '-Wuse-without-only' is not supported
+! CHECK-NEXT: the warning option '-Wintrinsics-std' is not supported
+! CHECK-NEXT: the warning option '-Wline-truncation' is not supported
+! CHECK-NEXT: the warning option '-Wno-align-commons' is not supported
+! CHECK-NEXT: the warning option '-Wno-overwrite-recursive' is not supported
+! CHECK-NEXT: the warning option '-Wno-tabs' is not supported
+! CHECK-NEXT: the warning option '-Wreal-q-constant' is not supported
+! CHECK-NEXT: the warning option '-Wsurprising' is not supported
+! CHECK-NEXT: the warning option '-Wunderflow' is not supported
+! CHECK-NEXT: the warning option '-Wunused-parameter' is not supported
+! CHECK-NEXT: the warning option '-Wrealloc-lhs' is not supported
+! CHECK-NEXT: the warning option '-Wrealloc-lhs-all' is not supported
+! CHECK-NEXT: the warning option '-Wfrontend-loop-interchange' is not supported
+! CHECK-NEXT: the warning option '-Wtarget-lifetime' is not supported
program m
end program
diff --git a/flang/test/Driver/wextra-ok.f90 b/flang/test/Driver/wextra-ok.f90
index 48676e8e62aa..6a38d9481a36 100644
--- a/flang/test/Driver/wextra-ok.f90
+++ b/flang/test/Driver/wextra-ok.f90
@@ -4,7 +4,7 @@
! RUN: %flang -std=f2018 -Wextra %s -c 2>&1 | FileCheck %s --check-prefix=CHECK-OK
! RUN: not %flang -std=f2018 -Wblah -Wextra %s -c 2>&1 | FileCheck %s --check-prefix=WRONG
-! CHECK-OK: The warning option '-Wextra' is not supported
+! CHECK-OK: the warning option '-Wextra' is not supported
! WRONG: Only `-Werror` is supported currently.
program wextra_ok
diff --git a/flang/test/Evaluate/triplets01.f90 b/flang/test/Evaluate/triplets01.f90
new file mode 100644
index 000000000000..aba9772f6b95
--- /dev/null
+++ b/flang/test/Evaluate/triplets01.f90
@@ -0,0 +1,11 @@
+! RUN: %python %S/test_folding.py %s %flang_fc1
+module m
+ logical, parameter :: test01 = all([1:10:2] == [(j, j=1,10,2)])
+ logical, parameter :: test02 = kind([1:20:2]) == kind(1)
+ logical, parameter :: test03 = all([10:1:-3,123] == [(j, j=10,1,-3),123])
+ logical, parameter :: test04 = kind([10:1:-3,123]) == kind(1)
+ logical, parameter :: test05 = kind([10_2:1_2:-3_2,123_2]) == 2
+ logical, parameter :: test06 = all([10_2:1_2:-3_2,123_2] == [(j, integer(2)::j=10,1,-3),123_2])
+ logical, parameter :: test07 = kind([10_2:1_4:-3_2]) == 4
+ logical, parameter :: test08 = kind([10_2:1_4]) == 4
+end
diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir
index 42bceb66668d..db252c4adfd4 100644
--- a/flang/test/Fir/basic-program.fir
+++ b/flang/test/Fir/basic-program.fir
@@ -34,7 +34,14 @@ func.func @_QQmain() {
// PASSES-NEXT: CSE
// PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd
// PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd
-// PASSES-NEXT: 'func.func' Pipeline
+// PASSES-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private']
+// PASSES-NEXT: 'fir.global' Pipeline
+// PASSES-NEXT: OptimizedBufferization
+// PASSES-NEXT: 'func.func' Pipeline
+// PASSES-NEXT: OptimizedBufferization
+// PASSES-NEXT: 'omp.declare_reduction' Pipeline
+// PASSES-NEXT: OptimizedBufferization
+// PASSES-NEXT: 'omp.private' Pipeline
// PASSES-NEXT: OptimizedBufferization
// PASSES-NEXT: LowerHLFIROrderedAssignments
// PASSES-NEXT: LowerHLFIRIntrinsics
diff --git a/flang/test/Integration/debug-complex-1.f90 b/flang/test/Integration/debug-complex-1.f90
new file mode 100644
index 000000000000..c8d0da4c4baa
--- /dev/null
+++ b/flang/test/Integration/debug-complex-1.f90
@@ -0,0 +1,26 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+
+program mn
+ complex(kind=4) :: c4
+ complex(kind=8) :: c8
+ complex(kind=16) :: r
+ r = fn1(c4, c8)
+ print *, r
+contains
+ function fn1(a, b) result (c)
+ complex(kind=4), intent(in) :: a
+ complex(kind=8), intent(in) :: b
+ complex(kind=16) :: c
+ c = a + b
+ end function
+end program
+
+! CHECK-DAG: ![[C4:.*]] = !DIBasicType(name: "complex", size: 64, encoding: DW_ATE_complex_float)
+! CHECK-DAG: ![[C8:.*]] = !DIBasicType(name: "complex", size: 128, encoding: DW_ATE_complex_float)
+! CHECK-DAG: ![[C16:.*]] = !DIBasicType(name: "complex", size: 256, encoding: DW_ATE_complex_float)
+! CHECK-DAG: !DILocalVariable(name: "c4"{{.*}}type: ![[C4]])
+! CHECK-DAG: !DILocalVariable(name: "c8"{{.*}}type: ![[C8]])
+! CHECK-DAG: !DILocalVariable(name: "r"{{.*}}type: ![[C16]])
+! CHECK-DAG: !DILocalVariable(name: "a"{{.*}}type: ![[C4]])
+! CHECK-DAG: !DILocalVariable(name: "b"{{.*}}type: ![[C8]])
+! CHECK-DAG: !DILocalVariable(name: "c"{{.*}}type: ![[C16]])
diff --git a/flang/test/Integration/debug-fixed-array-type-2.f90 b/flang/test/Integration/debug-fixed-array-type-2.f90
new file mode 100644
index 000000000000..315525442a5b
--- /dev/null
+++ b/flang/test/Integration/debug-fixed-array-type-2.f90
@@ -0,0 +1,43 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+
+program mn
+
+ integer d1(3)
+ integer d2(2, 5)
+ real d3(6, 8, 7)
+
+ i8 = fn1(d1, d2, d3)
+contains
+ function fn1(a1, b1, c1) result (res)
+ integer a1(3)
+ integer b1(2, 5)
+ real c1(6, 8, 7)
+ integer res
+ res = a1(1) + b1(1,2) + c1(3, 3, 4)
+ end function
+
+end program
+
+! CHECK-DAG: ![[INT:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed)
+! CHECK-DAG: ![[REAL:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float)
+! CHECK-DAG: ![[R1:.*]] = !DISubrange(count: 3, lowerBound: 1)
+! CHECK-DAG: ![[SUB1:.*]] = !{![[R1]]}
+! CHECK-DAG: ![[D1TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[SUB1]])
+! CHECK-DAG: !DILocalVariable(name: "d1"{{.*}}type: ![[D1TY]])
+
+! CHECK-DAG: ![[R21:.*]] = !DISubrange(count: 2, lowerBound: 1)
+! CHECK-DAG: ![[R22:.*]] = !DISubrange(count: 5, lowerBound: 1)
+! CHECK-DAG: ![[SUB2:.*]] = !{![[R21]], ![[R22]]}
+! CHECK-DAG: ![[D2TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[INT]], elements: ![[SUB2]])
+! CHECK-DAG: !DILocalVariable(name: "d2"{{.*}}type: ![[D2TY]])
+
+! CHECK-DAG: ![[R31:.*]] = !DISubrange(count: 6, lowerBound: 1)
+! CHECK-DAG: ![[R32:.*]] = !DISubrange(count: 8, lowerBound: 1)
+! CHECK-DAG: ![[R33:.*]] = !DISubrange(count: 7, lowerBound: 1)
+! CHECK-DAG: ![[SUB3:.*]] = !{![[R31]], ![[R32]], ![[R33]]}
+! CHECK-DAG: ![[D3TY:.*]] = !DICompositeType(tag: DW_TAG_array_type, baseType: ![[REAL]], elements: ![[SUB3]])
+! CHECK-DAG: !DILocalVariable(name: "d3"{{.*}}type: ![[D3TY]])
+
+! CHECK-DAG: !DILocalVariable(name: "a1", arg: 1{{.*}}type: ![[D1TY]])
+! CHECK-DAG: !DILocalVariable(name: "b1", arg: 2{{.*}}type: ![[D2TY]])
+! CHECK-DAG: !DILocalVariable(name: "c1", arg: 3{{.*}}type: ![[D3TY]])
diff --git a/flang/test/Integration/debug-module-2.f90 b/flang/test/Integration/debug-module-2.f90
new file mode 100644
index 000000000000..60fccaa2a6c1
--- /dev/null
+++ b/flang/test/Integration/debug-module-2.f90
@@ -0,0 +1,39 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck --check-prefix=LINEONLY %s
+
+! CHECK-DAG: ![[FILE:.*]] = !DIFile(filename: {{.*}}debug-module-2.f90{{.*}})
+! CHECK-DAG: ![[FILE2:.*]] = !DIFile(filename: {{.*}}debug-module-2.f90{{.*}})
+! CHECK-DAG: ![[CU:.*]] = distinct !DICompileUnit({{.*}}file: ![[FILE]]{{.*}} globals: ![[GLOBALS:.*]])
+! CHECK-DAG: ![[MOD:.*]] = !DIModule(scope: ![[CU]], name: "helper", file: ![[FILE]]{{.*}})
+! CHECK-DAG: ![[R4:.*]] = !DIBasicType(name: "real", size: 32, encoding: DW_ATE_float)
+! CHECK-DAG: ![[I4:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed)
+module helper
+! CHECK-DAG: ![[GLR:.*]] = distinct !DIGlobalVariable(name: "glr", linkageName: "_QMhelperEglr", scope: ![[MOD]], file: ![[FILE]], line: [[@LINE+2]], type: ![[R4]], isLocal: false, isDefinition: true)
+! CHECK-DAG: ![[GLRX:.*]] = !DIGlobalVariableExpression(var: ![[GLR]], expr: !DIExpression())
+ real glr
+
+! CHECK-DAG: ![[GLI:.*]] = distinct !DIGlobalVariable(name: "gli", linkageName: "_QMhelperEgli", scope: ![[MOD]], file: ![[FILE]], line: [[@LINE+2]], type: ![[I4]], isLocal: false, isDefinition: true)
+! CHECK-DAG: ![[GLIX:.*]] = !DIGlobalVariableExpression(var: ![[GLI]], expr: !DIExpression())
+ integer gli
+
+ contains
+!CHECK-DAG: !DISubprogram(name: "test", linkageName: "_QMhelperPtest", scope: ![[MOD]], file: ![[FILE2]], line: [[@LINE+1]]{{.*}}unit: ![[CU]])
+ subroutine test()
+ glr = 12.34
+ gli = 67
+
+ end subroutine
+end module helper
+
+program test
+use helper
+implicit none
+
+ glr = 3.14
+ gli = 2
+ call test()
+
+end program test
+
+! CHECK-DAG: ![[GLOBALS]] = !{![[GLIX]], ![[GLRX]]}
+! LINEONLY-NOT: DIGlobalVariable
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 084314ed63ec..42fa4d09c95e 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -25,6 +25,8 @@ subroutine sub1()
adev = ahost + bhost
+ adev = 10
+
end
! CHECK-LABEL: func.func @_QPsub1()
@@ -41,10 +43,7 @@ end
! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
-! CHECK: %[[C1:.*]] = arith.constant 1 : i32
-! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[C1]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
-! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
+! CHECK: cuf.data_transfer %c1{{.*}} to %[[M]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : i32, !fir.ref<i32>
! CHECK: cuf.data_transfer %[[AHOST]]#0 to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
@@ -62,6 +61,8 @@ end
! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
+! CHECK: cuf.data_transfer %c10{{.*}} to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : i32, !fir.ref<!fir.array<10xi32>>
+
subroutine sub2()
integer, device :: m
integer, device :: adev(10), bdev(10)
@@ -159,3 +160,22 @@ end subroutine
! CHECK-LABEL: func.func @_QPsub6
! CHECK: cuf.data_transfer
+
+subroutine sub7(a, b, c)
+ integer, device, allocatable :: a(:), c(:)
+ integer, allocatable :: b(:)
+ b = a
+
+ a = b
+
+ c = a
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub7(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {fir.bindc_name = "b"}, %[[ARG2:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "c"}) {
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[B:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: %[[C:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ec"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
+! CHECK: cuf.data_transfer %[[A]]#0 to %[[B]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: cuf.data_transfer %[[B]]#0 to %[[A]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: cuf.data_transfer %[[A]]#0 to %[[C]]#0 {transfer_kind = #cuf.cuda_transfer<device_device>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
diff --git a/flang/test/Lower/Intrinsics/selected_char_kind.f90 b/flang/test/Lower/Intrinsics/selected_char_kind.f90
new file mode 100644
index 000000000000..4012591f2286
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/selected_char_kind.f90
@@ -0,0 +1,17 @@
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
+
+subroutine selected_char_kind_test(c)
+ character(*) :: c
+ integer :: res
+ res = selected_char_kind(c)
+end
+
+! CHECK-LABEL: func.func @_QPselected_char_kind_test(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"})
+! CHECK: %[[UNBOXCHAR:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+! CHECK: %[[C:.*]]:2 = hlfir.declare %[[UNBOXCHAR]]#0 typeparams %[[UNBOXCHAR]]#1 dummy_scope %0 {uniq_name = "_QFselected_char_kind_testEc"} : (!fir.ref<!fir.char<1,?>>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFselected_char_kind_testEres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_char_kind_testEres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[CHAR_PTR:.*]] = fir.convert %[[C]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
+! CHECK: %[[CHAR_LEN:.*]] = fir.convert %[[UNBOXCHAR]]#1 : (index) -> i64
+! CHECK: %{{.*}} = fir.call @_FortranASelectedCharKind(%{{.*}}, %{{.*}}, %[[CHAR_PTR]], %[[CHAR_LEN]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.ref<i8>, i64) -> i32
diff --git a/flang/test/Lower/Intrinsics/selected_logical_kind.f90 b/flang/test/Lower/Intrinsics/selected_logical_kind.f90
new file mode 100644
index 000000000000..93952762cce5
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/selected_logical_kind.f90
@@ -0,0 +1,71 @@
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
+
+subroutine selected_logical_kind_test1(input)
+ integer(1) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test1(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i8> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test1Einput"} : (!fir.ref<i8>, !fir.dscope) -> (!fir.ref<i8>, !fir.ref<i8>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i8 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test1Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test1Eres"} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
+! CHECK: %[[KIND:.*]] = arith.constant 1 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %1#1 : (!fir.ref<i8>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
+
+subroutine selected_logical_kind_test2(input)
+ integer(2) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test2(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i16> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test2Einput"} : (!fir.ref<i16>, !fir.dscope) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i16 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test2Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test2Eres"} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK: %[[KIND:.*]] = arith.constant 2 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %[[INPUT]]#1 : (!fir.ref<i16>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
+
+subroutine selected_logical_kind_test4(input)
+ integer(4) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test4(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFselected_logical_kind_test4Einput"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test4Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test4Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[KIND:.*]] = arith.constant 4 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %[[INPUT]]#1 : (!fir.ref<i32>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
+
+subroutine selected_logical_kind_test8(input)
+ integer(8) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test8(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i64> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test8Einput"} : (!fir.ref<i64>, !fir.dscope) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[RES_ALLOCA]] = fir.alloca i64 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test8Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test8Eres"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK: %[[KIND:.*]] = arith.constant 8 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %[[INPUT]]#1 : (!fir.ref<i64>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
+
+subroutine selected_logical_kind_test16(input)
+ integer(16) :: input, res
+ res = selected_logical_kind(input)
+end
+
+! CHECK-LABEL: func.func @_QPselected_logical_kind_test16(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i128> {fir.bindc_name = "input"})
+! CHECK: %[[INPUT:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} {uniq_name = "_QFselected_logical_kind_test16Einput"} : (!fir.ref<i128>, !fir.dscope) -> (!fir.ref<i128>, !fir.ref<i128>)
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i128 {bindc_name = "res", uniq_name = "_QFselected_logical_kind_test16Eres"}
+! CHECK: %[[RES:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {uniq_name = "_QFselected_logical_kind_test16Eres"} : (!fir.ref<i128>) -> (!fir.ref<i128>, !fir.ref<i128>)
+! CHECK: %[[KIND:.*]] = arith.constant 16 : i32
+! CHECK: %[[INPUT_ADDR:.*]] = fir.convert %[[INPUT]]#1 : (!fir.ref<i128>) -> !fir.llvm_ptr<i8>
+! CHECK: %{{.*}} = fir.call @_FortranASelectedLogicalKind(%{{.*}}, %{{.*}}, %[[INPUT_ADDR]], %[[KIND]]) fastmath<contract> : (!fir.ref<i8>, i32, !fir.llvm_ptr<i8>, i32) -> i32
diff --git a/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90 b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90
index 53871276761f..b3e87df7086e 100644
--- a/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90
+++ b/flang/test/Lower/OpenMP/invalid-reduction-modifier.f90
@@ -1,6 +1,4 @@
-!Remove the --crash below once we can diagnose the issue more gracefully.
-!REQUIRES: asserts
-!RUN: not --crash %flang_fc1 -fopenmp -emit-hlfir -o - %s
+!RUN: not %flang_fc1 -fopenmp -emit-hlfir -o - %s
! Check that we reject the "task" reduction modifier on the "simd" directive.
diff --git a/flang/test/Semantics/OpenMP/allocate-clause01.f90 b/flang/test/Semantics/OpenMP/allocate-clause01.f90
index 486166ec6338..2b9a72e928eb 100644
--- a/flang/test/Semantics/OpenMP/allocate-clause01.f90
+++ b/flang/test/Semantics/OpenMP/allocate-clause01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate-directive.f90 b/flang/test/Semantics/OpenMP/allocate-directive.f90
index f55b724980fb..18a14b825f00 100644
--- a/flang/test/Semantics/OpenMP/allocate-directive.f90
+++ b/flang/test/Semantics/OpenMP/allocate-directive.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90
index a3d5fb5f90cd..6ccb8bb09e83 100644
--- a/flang/test/Semantics/OpenMP/allocate01.f90
+++ b/flang/test/Semantics/OpenMP/allocate01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90
index b9bfdbe55aa2..8f0579e810bb 100644
--- a/flang/test/Semantics/OpenMP/allocate02.f90
+++ b/flang/test/Semantics/OpenMP/allocate02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90
index ce577c857985..e35115f3897c 100644
--- a/flang/test/Semantics/OpenMP/allocate03.f90
+++ b/flang/test/Semantics/OpenMP/allocate03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate04.f90 b/flang/test/Semantics/OpenMP/allocate04.f90
index 37f180cc16aa..ea89d9446cc1 100644
--- a/flang/test/Semantics/OpenMP/allocate04.f90
+++ b/flang/test/Semantics/OpenMP/allocate04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate05.f90 b/flang/test/Semantics/OpenMP/allocate05.f90
index c4e0ace988bd..a787e8bb32a4 100644
--- a/flang/test/Semantics/OpenMP/allocate05.f90
+++ b/flang/test/Semantics/OpenMP/allocate05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90
index e25b4c4decd5..e14134cd0730 100644
--- a/flang/test/Semantics/OpenMP/allocate06.f90
+++ b/flang/test/Semantics/OpenMP/allocate06.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate07.f90 b/flang/test/Semantics/OpenMP/allocate07.f90
index 2b0f17647b3c..396df598b252 100644
--- a/flang/test/Semantics/OpenMP/allocate07.f90
+++ b/flang/test/Semantics/OpenMP/allocate07.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate08.f90 b/flang/test/Semantics/OpenMP/allocate08.f90
index 82aa11d69cfc..fc950ea4fca3 100644
--- a/flang/test/Semantics/OpenMP/allocate08.f90
+++ b/flang/test/Semantics/OpenMP/allocate08.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocate09.f90 b/flang/test/Semantics/OpenMP/allocate09.f90
index 3664c34c7e43..0f93a340fe1e 100644
--- a/flang/test/Semantics/OpenMP/allocate09.f90
+++ b/flang/test/Semantics/OpenMP/allocate09.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators01.f90 b/flang/test/Semantics/OpenMP/allocators01.f90
index f10db35f96d9..c75c522ecae1 100644
--- a/flang/test/Semantics/OpenMP/allocators01.f90
+++ b/flang/test/Semantics/OpenMP/allocators01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators02.f90 b/flang/test/Semantics/OpenMP/allocators02.f90
index 7f8fa3600277..8055d21c6809 100644
--- a/flang/test/Semantics/OpenMP/allocators02.f90
+++ b/flang/test/Semantics/OpenMP/allocators02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators03.f90 b/flang/test/Semantics/OpenMP/allocators03.f90
index 050cc2051c99..03cff1b1e991 100644
--- a/flang/test/Semantics/OpenMP/allocators03.f90
+++ b/flang/test/Semantics/OpenMP/allocators03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators04.f90 b/flang/test/Semantics/OpenMP/allocators04.f90
index 3c84030c4e39..1d2e96443a9d 100644
--- a/flang/test/Semantics/OpenMP/allocators04.f90
+++ b/flang/test/Semantics/OpenMP/allocators04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators05.f90 b/flang/test/Semantics/OpenMP/allocators05.f90
index 8fd80b033756..d0e11ca5874d 100644
--- a/flang/test/Semantics/OpenMP/allocators05.f90
+++ b/flang/test/Semantics/OpenMP/allocators05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/allocators06.f90 b/flang/test/Semantics/OpenMP/allocators06.f90
index 881182caa9b3..a975204c1133 100644
--- a/flang/test/Semantics/OpenMP/allocators06.f90
+++ b/flang/test/Semantics/OpenMP/allocators06.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90 b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
index 9050cbb0dca6..e157b7e1e73a 100644
--- a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
+++ b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic.f90 b/flang/test/Semantics/OpenMP/atomic.f90
index 2f270ce33338..44f06b7460bf 100644
--- a/flang/test/Semantics/OpenMP/atomic.f90
+++ b/flang/test/Semantics/OpenMP/atomic.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
use omp_lib
! Check OpenMP 2.13.6 atomic Construct
diff --git a/flang/test/Semantics/OpenMP/atomic01.f90 b/flang/test/Semantics/OpenMP/atomic01.f90
index 6ec94f3ff3a4..f0e1b47d2fa1 100644
--- a/flang/test/Semantics/OpenMP/atomic01.f90
+++ b/flang/test/Semantics/OpenMP/atomic01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic02.f90 b/flang/test/Semantics/OpenMP/atomic02.f90
index 92f2c4b9d040..b823bc4c33b2 100644
--- a/flang/test/Semantics/OpenMP/atomic02.f90
+++ b/flang/test/Semantics/OpenMP/atomic02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic03.f90 b/flang/test/Semantics/OpenMP/atomic03.f90
index 4cce71dba351..76367495b986 100644
--- a/flang/test/Semantics/OpenMP/atomic03.f90
+++ b/flang/test/Semantics/OpenMP/atomic03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic04.f90 b/flang/test/Semantics/OpenMP/atomic04.f90
index c03b230c837a..a9644ad95aa3 100644
--- a/flang/test/Semantics/OpenMP/atomic04.f90
+++ b/flang/test/Semantics/OpenMP/atomic04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/atomic05.f90 b/flang/test/Semantics/OpenMP/atomic05.f90
index cfba33968213..2d9566463309 100644
--- a/flang/test/Semantics/OpenMP/atomic05.f90
+++ b/flang/test/Semantics/OpenMP/atomic05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/barrier.f90 b/flang/test/Semantics/OpenMP/barrier.f90
index 5fc3f7f3bd70..1483fbd08f95 100644
--- a/flang/test/Semantics/OpenMP/barrier.f90
+++ b/flang/test/Semantics/OpenMP/barrier.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
!$omp barrier
diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90
index 779be00b9eba..22ac57065ffe 100644
--- a/flang/test/Semantics/OpenMP/clause-validity01.f90
+++ b/flang/test/Semantics/OpenMP/clause-validity01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags %openmp_module_flag
diff --git a/flang/test/Semantics/OpenMP/combined-constructs.f90 b/flang/test/Semantics/OpenMP/combined-constructs.f90
index ba504d1b8e22..35ab6fcac58b 100644
--- a/flang/test/Semantics/OpenMP/combined-constructs.f90
+++ b/flang/test/Semantics/OpenMP/combined-constructs.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
program main
diff --git a/flang/test/Semantics/OpenMP/common-block.f90 b/flang/test/Semantics/OpenMP/common-block.f90
index 4ddc5474a628..e1ddd120da85 100644
--- a/flang/test/Semantics/OpenMP/common-block.f90
+++ b/flang/test/Semantics/OpenMP/common-block.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s
program main
diff --git a/flang/test/Semantics/OpenMP/compiler-directive.f90 b/flang/test/Semantics/OpenMP/compiler-directive.f90
index 07363ac5ac1e..5d3e9bae27fd 100644
--- a/flang/test/Semantics/OpenMP/compiler-directive.f90
+++ b/flang/test/Semantics/OpenMP/compiler-directive.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! CompilerDirective with openmp tests
diff --git a/flang/test/Semantics/OpenMP/copyin01.f90 b/flang/test/Semantics/OpenMP/copyin01.f90
index 387a9fc7cf0b..0051b5d441f0 100644
--- a/flang/test/Semantics/OpenMP/copyin01.f90
+++ b/flang/test/Semantics/OpenMP/copyin01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copyin02.f90 b/flang/test/Semantics/OpenMP/copyin02.f90
index 92512890e3ed..09b876677ea3 100644
--- a/flang/test/Semantics/OpenMP/copyin02.f90
+++ b/flang/test/Semantics/OpenMP/copyin02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copyin03.f90 b/flang/test/Semantics/OpenMP/copyin03.f90
index 5c0a2e873d81..7c3759aa2e11 100644
--- a/flang/test/Semantics/OpenMP/copyin03.f90
+++ b/flang/test/Semantics/OpenMP/copyin03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copyin04.f90 b/flang/test/Semantics/OpenMP/copyin04.f90
index 7cbee5f4afab..6f5e8dfef217 100644
--- a/flang/test/Semantics/OpenMP/copyin04.f90
+++ b/flang/test/Semantics/OpenMP/copyin04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copyin05.f90 b/flang/test/Semantics/OpenMP/copyin05.f90
index aec6a7f88070..142d5a7345c6 100644
--- a/flang/test/Semantics/OpenMP/copyin05.f90
+++ b/flang/test/Semantics/OpenMP/copyin05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.15.4.1 copyin Clause
diff --git a/flang/test/Semantics/OpenMP/copying.f90 b/flang/test/Semantics/OpenMP/copying.f90
index d56d2b8932cf..63fb39a0f26e 100644
--- a/flang/test/Semantics/OpenMP/copying.f90
+++ b/flang/test/Semantics/OpenMP/copying.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp -Werror -pedantic
! OpenMP Version 5.0
! 2.19.4.4 firstprivate Clause
diff --git a/flang/test/Semantics/OpenMP/copyprivate01.f90 b/flang/test/Semantics/OpenMP/copyprivate01.f90
index 4920d7abbe7c..d5cf27347607 100644
--- a/flang/test/Semantics/OpenMP/copyprivate01.f90
+++ b/flang/test/Semantics/OpenMP/copyprivate01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.15.4.2 copyprivate Clause
diff --git a/flang/test/Semantics/OpenMP/copyprivate02.f90 b/flang/test/Semantics/OpenMP/copyprivate02.f90
index 2157cd4cb558..35fd6dddd20c 100644
--- a/flang/test/Semantics/OpenMP/copyprivate02.f90
+++ b/flang/test/Semantics/OpenMP/copyprivate02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.15.4.2 copyprivate Clause
diff --git a/flang/test/Semantics/OpenMP/copyprivate03.f90 b/flang/test/Semantics/OpenMP/copyprivate03.f90
index f1433ced8aac..9d39fdb6b13c 100644
--- a/flang/test/Semantics/OpenMP/copyprivate03.f90
+++ b/flang/test/Semantics/OpenMP/copyprivate03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.15.4.2 copyprivate Clause
diff --git a/flang/test/Semantics/OpenMP/critical-empty.f90 b/flang/test/Semantics/OpenMP/critical-empty.f90
index 706f6d806f55..2001c8a14a7b 100644
--- a/flang/test/Semantics/OpenMP/critical-empty.f90
+++ b/flang/test/Semantics/OpenMP/critical-empty.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! Test that there are no errors for an empty critical construct
diff --git a/flang/test/Semantics/OpenMP/critical-hint-clause.f90 b/flang/test/Semantics/OpenMP/critical-hint-clause.f90
index d737d671973c..419187fa3bbf 100644
--- a/flang/test/Semantics/OpenMP/critical-hint-clause.f90
+++ b/flang/test/Semantics/OpenMP/critical-hint-clause.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/do02.f90 b/flang/test/Semantics/OpenMP/do02.f90
new file mode 100644
index 000000000000..d9f5c9963ca5
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/do02.f90
@@ -0,0 +1,21 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
+! XFAIL: *
+
+! OpenMP Version 4.5
+! 2.7.1 Loop Construct
+! Exit statement terminating !$OMP DO loop
+
+program omp_do
+ integer i, j, k
+
+ !$omp do
+ do i = 1, 10
+ do j = 1, 10
+ print *, "Hello"
+ end do
+ !ERROR: EXIT statement terminating !$OMP DO loop
+ exit
+ end do
+ !$omp end do
+
+end program omp_do
diff --git a/flang/test/Semantics/OpenMP/reduction-modifiers.f90 b/flang/test/Semantics/OpenMP/reduction-modifiers.f90
new file mode 100644
index 000000000000..cf38200ba0a8
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/reduction-modifiers.f90
@@ -0,0 +1,89 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -fopenmp-version=52
+
+subroutine mod_task1(x)
+ integer, intent(inout) :: x
+
+ !Correct: "parallel" directive.
+ !$omp parallel reduction(task, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end parallel
+end
+
+subroutine mod_task2(x)
+ integer, intent(inout) :: x
+
+ !Correct: worksharing directive.
+ !$omp sections reduction(task, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end sections
+end
+
+subroutine mod_task3(x)
+ integer, intent(inout) :: x
+
+ !ERROR: Modifier 'TASK' on REDUCTION clause is only allowed with PARALLEL or worksharing directive
+ !$omp simd reduction(task, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end simd
+end
+
+subroutine mod_inscan1(x)
+ integer, intent(inout) :: x
+
+ !Correct: worksharing-loop directive
+ !$omp do reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end do
+end
+
+subroutine mod_inscan2(x)
+ integer, intent(inout) :: x
+
+ !Correct: worksharing-loop simd directive
+ !$omp do simd reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end do simd
+end
+
+subroutine mod_inscan3(x)
+ integer, intent(inout) :: x
+
+ !Correct: "simd" directive
+ !$omp simd reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end simd
+end
+
+subroutine mod_inscan4(x)
+ integer, intent(inout) :: x
+
+ !ERROR: Modifier 'INSCAN' on REDUCTION clause is only allowed with worksharing-loop, worksharing-loop simd, or SIMD directive
+ !$omp parallel reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end parallel
+end
+
+subroutine mod_inscan5(x)
+ integer, intent(inout) :: x
+
+ !ERROR: Modifier 'INSCAN' on REDUCTION clause is only allowed with worksharing-loop, worksharing-loop simd, or SIMD directive
+ !$omp sections reduction(inscan, +:x)
+ do i = 1, 100
+ x = foo(i)
+ enddo
+ !$omp end sections
+end
diff --git a/flang/test/Semantics/OpenMP/sections01.f90 b/flang/test/Semantics/OpenMP/sections01.f90
index 00b5a6d8fbc4..c26cc88dcc7a 100644
--- a/flang/test/Semantics/OpenMP/sections01.f90
+++ b/flang/test/Semantics/OpenMP/sections01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/sections02.f90 b/flang/test/Semantics/OpenMP/sections02.f90
index 912e7bc2a8ff..ee29922a72c0 100644
--- a/flang/test/Semantics/OpenMP/sections02.f90
+++ b/flang/test/Semantics/OpenMP/sections02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/sections03.f90 b/flang/test/Semantics/OpenMP/sections03.f90
deleted file mode 100644
index b170f8674d19..000000000000
--- a/flang/test/Semantics/OpenMP/sections03.f90
+++ /dev/null
@@ -1,29 +0,0 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
-! RUN: %python %S/../test_errors.py %s %flang -fopenmp
-!XFAIL: *
-! OpenMP version 5.0.0
-! 2.8.1 sections construct
-! Orphaned section directives are prohibited. That is, the section directives must appear within the sections construct and must not be encountered elsewhere in the sections region
-!TODO: Error in parsing. Make parser errors more informative. Until then, the test is XFAIL
-
-program OmpOrphanedSections
- use omp_lib
- integer counter
- counter = 0
- !CHECK: expected 'END'
- !CHECK: END PROGRAM statement
- !CHECK: in the context: main program
- !CHECK: expected 'END PROGRAM'
- !CHECK: in the context: END PROGRAM statement
- !CHECK: in the context: main program
- !$omp section
- print *, "An orphaned section containing a single statement"
- !$omp section
- counter = counter + 1
- print *, "An orphaned section containing multiple statements"
-!$omp sections
- !$omp section
- print *, "Not an orphan structured block"
-!$omp end sections
-end program OmpOrphanedSections
diff --git a/flang/test/Semantics/OpenMP/simd-aligned.f90 b/flang/test/Semantics/OpenMP/simd-aligned.f90
index 3ffdc68693fd..0a9f95833e22 100644
--- a/flang/test/Semantics/OpenMP/simd-aligned.f90
+++ b/flang/test/Semantics/OpenMP/simd-aligned.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/simd-nontemporal.f90 b/flang/test/Semantics/OpenMP/simd-nontemporal.f90
index 074b0a2039ed..a488edd98cdc 100644
--- a/flang/test/Semantics/OpenMP/simd-nontemporal.f90
+++ b/flang/test/Semantics/OpenMP/simd-nontemporal.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/simd01.f90 b/flang/test/Semantics/OpenMP/simd01.f90
index 1e241648f75a..1aa2880cda83 100644
--- a/flang/test/Semantics/OpenMP/simd01.f90
+++ b/flang/test/Semantics/OpenMP/simd01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 5.0
! 2.9.3.1 simd Construct
diff --git a/flang/test/Semantics/OpenMP/simd02.f90 b/flang/test/Semantics/OpenMP/simd02.f90
index 24d6abd9761f..a627e2ac2d67 100644
--- a/flang/test/Semantics/OpenMP/simd02.f90
+++ b/flang/test/Semantics/OpenMP/simd02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/simd03.f90 b/flang/test/Semantics/OpenMP/simd03.f90
index 8df48368fa96..8c90eba8fd8e 100644
--- a/flang/test/Semantics/OpenMP/simd03.f90
+++ b/flang/test/Semantics/OpenMP/simd03.f90
@@ -1,6 +1,4 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
-! RUN: %S/test_errors.sh %s %t %flang -fopenmp
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
! XFAIL: *
! OpenMP Version 4.5
diff --git a/flang/test/Semantics/OpenMP/single01.f90 b/flang/test/Semantics/OpenMP/single01.f90
index 0468e695d8cf..2e40bec56e9c 100644
--- a/flang/test/Semantics/OpenMP/single01.f90
+++ b/flang/test/Semantics/OpenMP/single01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.7.3 single Construct
diff --git a/flang/test/Semantics/OpenMP/single02.f90 b/flang/test/Semantics/OpenMP/single02.f90
index 9d9d306c2f53..03cf7fbb6ad3 100644
--- a/flang/test/Semantics/OpenMP/single02.f90
+++ b/flang/test/Semantics/OpenMP/single02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 4.5
! 2.7.3 single Construct
diff --git a/flang/test/Semantics/OpenMP/struct.f90 b/flang/test/Semantics/OpenMP/struct.f90
index 3d2000aef993..8ae1fbe4da86 100644
--- a/flang/test/Semantics/OpenMP/struct.f90
+++ b/flang/test/Semantics/OpenMP/struct.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! Check OpenMP compatibility with the DEC STRUCTURE extension
diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90
index e2a9c01e9d5f..0b435a9ab985 100644
--- a/flang/test/Semantics/OpenMP/symbol01.f90
+++ b/flang/test/Semantics/OpenMP/symbol01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! Test clauses that accept list.
diff --git a/flang/test/Semantics/OpenMP/symbol02.f90 b/flang/test/Semantics/OpenMP/symbol02.f90
index 1b1dc4489448..f6ffc5500d0a 100644
--- a/flang/test/Semantics/OpenMP/symbol02.f90
+++ b/flang/test/Semantics/OpenMP/symbol02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 1.4.1 Structure of the OpenMP Memory Model
diff --git a/flang/test/Semantics/OpenMP/symbol03.f90 b/flang/test/Semantics/OpenMP/symbol03.f90
index 76d93577d3ac..93e9b7a3eae6 100644
--- a/flang/test/Semantics/OpenMP/symbol03.f90
+++ b/flang/test/Semantics/OpenMP/symbol03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 1.4.1 Structure of the OpenMP Memory Model
diff --git a/flang/test/Semantics/OpenMP/symbol04.f90 b/flang/test/Semantics/OpenMP/symbol04.f90
index 8ef154ebbf9d..808d1e0dd09b 100644
--- a/flang/test/Semantics/OpenMP/symbol04.f90
+++ b/flang/test/Semantics/OpenMP/symbol04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 2.15.3 Data-Sharing Attribute Clauses
diff --git a/flang/test/Semantics/OpenMP/symbol05.f90 b/flang/test/Semantics/OpenMP/symbol05.f90
index d08d85270380..fa0a8f65a429 100644
--- a/flang/test/Semantics/OpenMP/symbol05.f90
+++ b/flang/test/Semantics/OpenMP/symbol05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 2.15.2 threadprivate Directive
diff --git a/flang/test/Semantics/OpenMP/symbol06.f90 b/flang/test/Semantics/OpenMP/symbol06.f90
index a2cd288dfd15..906264eb1264 100644
--- a/flang/test/Semantics/OpenMP/symbol06.f90
+++ b/flang/test/Semantics/OpenMP/symbol06.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 2.15.3 Data-Sharing Attribute Clauses
diff --git a/flang/test/Semantics/OpenMP/symbol07.f90 b/flang/test/Semantics/OpenMP/symbol07.f90
index ee6cd2a0df2e..e2250f5c7908 100644
--- a/flang/test/Semantics/OpenMP/symbol07.f90
+++ b/flang/test/Semantics/OpenMP/symbol07.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! Generic tests
diff --git a/flang/test/Semantics/OpenMP/symbol08.f90 b/flang/test/Semantics/OpenMP/symbol08.f90
index 76db86cd54ca..3af85af74ee9 100644
--- a/flang/test/Semantics/OpenMP/symbol08.f90
+++ b/flang/test/Semantics/OpenMP/symbol08.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! 2.15.1.1 Predetermined rules for associated do-loops index variable
diff --git a/flang/test/Semantics/OpenMP/symbol09.f90 b/flang/test/Semantics/OpenMP/symbol09.f90
index ee6cd2a0df2e..e2250f5c7908 100644
--- a/flang/test/Semantics/OpenMP/symbol09.f90
+++ b/flang/test/Semantics/OpenMP/symbol09.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp
! Generic tests
diff --git a/flang/test/Semantics/OpenMP/sync-critical01.f90 b/flang/test/Semantics/OpenMP/sync-critical01.f90
index ef377ebc72f2..b597eb17ea22 100644
--- a/flang/test/Semantics/OpenMP/sync-critical01.f90
+++ b/flang/test/Semantics/OpenMP/sync-critical01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 5.0
diff --git a/flang/test/Semantics/OpenMP/sync-critical02.f90 b/flang/test/Semantics/OpenMP/sync-critical02.f90
index 681aa7944c4f..1fa9d6ad84f2 100644
--- a/flang/test/Semantics/OpenMP/sync-critical02.f90
+++ b/flang/test/Semantics/OpenMP/sync-critical02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/taskloop01.f90 b/flang/test/Semantics/OpenMP/taskloop01.f90
index 2c5375949404..6bef58438151 100644
--- a/flang/test/Semantics/OpenMP/taskloop01.f90
+++ b/flang/test/Semantics/OpenMP/taskloop01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.9.2 taskloop Construct
diff --git a/flang/test/Semantics/OpenMP/taskloop02.f90 b/flang/test/Semantics/OpenMP/taskloop02.f90
index 275b079d38a1..867ef8a9806d 100644
--- a/flang/test/Semantics/OpenMP/taskloop02.f90
+++ b/flang/test/Semantics/OpenMP/taskloop02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: not %flang -fsyntax-only -fopenmp %s 2>&1 | FileCheck %s
! OpenMP Version 4.5
! 2.9.2 taskloop Construct
diff --git a/flang/test/Semantics/OpenMP/taskloop03.f90 b/flang/test/Semantics/OpenMP/taskloop03.f90
new file mode 100644
index 000000000000..3fe6a593bf49
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/taskloop03.f90
@@ -0,0 +1,25 @@
+! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
+! XFAIL: *
+
+! OpenMP Version 4.5
+! 2.9.2 taskloop Construct
+! All loops associated with the taskloop construct must be perfectly nested,
+! there must be no intervening code or any OpenMP directive between
+! any two loops
+
+program omp_taskloop
+ integer i, j
+
+ !$omp taskloop private(j) grainsize(500) nogroup
+ do i=1, 10000
+ do j=1, i
+ call loop_body(i, j)
+ end do
+ !ERROR: Loops associated with !$omp taskloop is not perfectly nested
+ !$omp single
+ print *, "omp single"
+ !$omp end single
+ end do
+ !$omp end taskloop
+
+end program omp_taskloop
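A rough C++ analogue of the corrected pattern the test is aiming at follows — a sketch only, with all names invented; collapse(2) is assumed so that both loops are associated with the taskloop construct, which is when the perfect-nesting rule applies, and nothing may then appear between the two loop headers:

void loop_body(int i, int j);

// Hypothetical example: perfectly nested loops under a taskloop construct.
void taskloop_ok() {
#pragma omp parallel
#pragma omp single
#pragma omp taskloop collapse(2) grainsize(500) nogroup
  for (int i = 1; i <= 10000; ++i)
    for (int j = 1; j <= 100; ++j)   // no statements or directives between the loop headers
      loop_body(i, j);
}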
diff --git a/flang/test/Semantics/OpenMP/taskwait.f90 b/flang/test/Semantics/OpenMP/taskwait.f90
index a3b15c7a1df0..e60051c9da8a 100644
--- a/flang/test/Semantics/OpenMP/taskwait.f90
+++ b/flang/test/Semantics/OpenMP/taskwait.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
!$omp taskwait
diff --git a/flang/test/Semantics/OpenMP/threadprivate01.f90 b/flang/test/Semantics/OpenMP/threadprivate01.f90
index 6597941ac3d5..c2cf9ba99ab0 100644
--- a/flang/test/Semantics/OpenMP/threadprivate01.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! REQUIRES: openmp_runtime
! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags
diff --git a/flang/test/Semantics/OpenMP/threadprivate02.f90 b/flang/test/Semantics/OpenMP/threadprivate02.f90
index 862d1e8a45c4..7f6e8dcc8e8a 100644
--- a/flang/test/Semantics/OpenMP/threadprivate02.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate03.f90 b/flang/test/Semantics/OpenMP/threadprivate03.f90
index 57d3b9209820..b466a8e05e9c 100644
--- a/flang/test/Semantics/OpenMP/threadprivate03.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp -pedantic
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate04.f90 b/flang/test/Semantics/OpenMP/threadprivate04.f90
index 8199dbaea166..3d8c7fb8de8f 100644
--- a/flang/test/Semantics/OpenMP/threadprivate04.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate05.f90 b/flang/test/Semantics/OpenMP/threadprivate05.f90
index eecf9e781cf7..cdbf3701b70a 100644
--- a/flang/test/Semantics/OpenMP/threadprivate05.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate06.f90 b/flang/test/Semantics/OpenMP/threadprivate06.f90
index 5537a8805e9f..f31c38f6f2b2 100644
--- a/flang/test/Semantics/OpenMP/threadprivate06.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate06.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.1
! Check OpenMP construct validity for the following directives:
diff --git a/flang/test/Semantics/OpenMP/threadprivate07.f90 b/flang/test/Semantics/OpenMP/threadprivate07.f90
index 5302fdf4ab71..c9a006ca0e08 100644
--- a/flang/test/Semantics/OpenMP/threadprivate07.f90
+++ b/flang/test/Semantics/OpenMP/threadprivate07.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! Check Threadprivate Directive with local variable of a BLOCK construct.
diff --git a/flang/test/Semantics/OpenMP/use_device_addr.f90 b/flang/test/Semantics/OpenMP/use_device_addr.f90
index dda00d510504..93a7643b5eb4 100644
--- a/flang/test/Semantics/OpenMP/use_device_addr.f90
+++ b/flang/test/Semantics/OpenMP/use_device_addr.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s
! OpenMP Version 5.1
! 2.14.2 use_device_addr clause
diff --git a/flang/test/Semantics/OpenMP/use_device_addr1.f90 b/flang/test/Semantics/OpenMP/use_device_addr1.f90
index c37e9a3a7e3e..867e324b68ad 100644
--- a/flang/test/Semantics/OpenMP/use_device_addr1.f90
+++ b/flang/test/Semantics/OpenMP/use_device_addr1.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.0
! 2.10.1 use_device_ptr clause
diff --git a/flang/test/Semantics/OpenMP/use_device_ptr.f90 b/flang/test/Semantics/OpenMP/use_device_ptr.f90
index e9e7fbb6c1f5..64b98cf67961 100644
--- a/flang/test/Semantics/OpenMP/use_device_ptr.f90
+++ b/flang/test/Semantics/OpenMP/use_device_ptr.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s
! OpenMP Version 5.0
! 2.10.1 use_device_ptr clause
diff --git a/flang/test/Semantics/OpenMP/use_device_ptr1.f90 b/flang/test/Semantics/OpenMP/use_device_ptr1.f90
index f705c50370da..176fb5f35a84 100644
--- a/flang/test/Semantics/OpenMP/use_device_ptr1.f90
+++ b/flang/test/Semantics/OpenMP/use_device_ptr1.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp
! OpenMP Version 5.0
! 2.10.1 use_device_ptr clause
diff --git a/flang/test/Semantics/OpenMP/workshare01.f90 b/flang/test/Semantics/OpenMP/workshare01.f90
index 615c3408dc7a..9667a306061c 100644
--- a/flang/test/Semantics/OpenMP/workshare01.f90
+++ b/flang/test/Semantics/OpenMP/workshare01.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/OpenMP/workshare02.f90 b/flang/test/Semantics/OpenMP/workshare02.f90
index b6faf197f1f2..e099ecb9f1e6 100644
--- a/flang/test/Semantics/OpenMP/workshare02.f90
+++ b/flang/test/Semantics/OpenMP/workshare02.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/OpenMP/workshare03.f90 b/flang/test/Semantics/OpenMP/workshare03.f90
index 2aea0ccce3c7..09d46abf42ee 100644
--- a/flang/test/Semantics/OpenMP/workshare03.f90
+++ b/flang/test/Semantics/OpenMP/workshare03.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/OpenMP/workshare04.f90 b/flang/test/Semantics/OpenMP/workshare04.f90
index e84459978e15..0ec635e52d2b 100644
--- a/flang/test/Semantics/OpenMP/workshare04.f90
+++ b/flang/test/Semantics/OpenMP/workshare04.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/OpenMP/workshare05.f90 b/flang/test/Semantics/OpenMP/workshare05.f90
index 30f3b988de91..b57053e092e6 100644
--- a/flang/test/Semantics/OpenMP/workshare05.f90
+++ b/flang/test/Semantics/OpenMP/workshare05.f90
@@ -1,5 +1,3 @@
-! UNSUPPORTED: system-windows
-! Marking as unsupported due to suspected long runtime on Windows
! RUN: %python %S/../test_errors.py %s %flang -fopenmp
! OpenMP Version 4.5
! 2.7.4 workshare Construct
diff --git a/flang/test/Semantics/bind-c12.f90 b/flang/test/Semantics/bind-c12.f90
index 55af8a93b5b5..01a8d0cdbc3d 100644
--- a/flang/test/Semantics/bind-c12.f90
+++ b/flang/test/Semantics/bind-c12.f90
@@ -26,8 +26,8 @@ end
subroutine subr5(p) bind(c)
interface
+ !WARNING: A dummy procedure of an interoperable procedure should be BIND(C)
subroutine p(c)
- !ERROR: An assumed-length dummy argument must not appear in a non-BIND(C) entry in a subprogram with an entry that must be interoperable
character(*), intent(in) :: c
end
end interface
@@ -52,8 +52,8 @@ end
subroutine subr8(p) bind(c)
interface
+ !WARNING: A dummy procedure of an interoperable procedure should be BIND(C)
subroutine p(n)
- !ERROR: A VALUE dummy argument must not appear in a non-BIND(C) entry of a subprogram with an entry that must be interoperable
integer, intent(in), value :: n
end
end interface
diff --git a/flang/test/Semantics/call05.f90 b/flang/test/Semantics/call05.f90
index 66d0a375fa56..71f2197067f7 100644
--- a/flang/test/Semantics/call05.f90
+++ b/flang/test/Semantics/call05.f90
@@ -1,4 +1,4 @@
-! RUN: %python %S/test_errors.py %s %flang_fc1
+! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic
! Test 15.5.2.5 constraints and restrictions for POINTER & ALLOCATABLE
! arguments when both sides of the call have the same attributes.
@@ -73,9 +73,9 @@ module m
call sma(ma) ! ok
call spp(pp) ! ok
call spa(pa) ! ok
- !ERROR: If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so
+ !PORTABILITY: If a POINTER or ALLOCATABLE actual argument is polymorphic, the corresponding dummy argument should also be so
call smp(pp)
- !ERROR: If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so
+ !PORTABILITY: If a POINTER or ALLOCATABLE actual argument is polymorphic, the corresponding dummy argument should also be so
call sma(pa)
!ERROR: If a POINTER or ALLOCATABLE dummy or actual argument is polymorphic, both must be so
call spp(mp)
diff --git a/flang/test/Semantics/call39.f90 b/flang/test/Semantics/call39.f90
index 41eeba100347..724c9f9c7b7d 100644
--- a/flang/test/Semantics/call39.f90
+++ b/flang/test/Semantics/call39.f90
@@ -1,4 +1,4 @@
-! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic -Werror
+! RUN: %python %S/test_errors.py %s %flang_fc1
! Tests actual/dummy pointer argument shape mismatches
module m
contains
@@ -11,6 +11,15 @@ module m
subroutine sa(p)
real, pointer, intent(in) :: p(..)
end
+ subroutine sao(p)
+ real, intent(in), optional, pointer :: p(..)
+ end
+ subroutine so(x)
+ real, intent(in), optional :: x(..)
+ end
+ subroutine soa(a)
+ real, intent(in), optional, allocatable :: a(..)
+ end
subroutine test
real, pointer :: a0, a1(:)
call s0(null(a0)) ! ok
@@ -23,9 +32,15 @@ module m
call s1(null(a1)) ! ok
call sa(null(a0)) ! ok
call sa(null(a1)) ! ok
- !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument
- call sa(null())
- !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument
+ !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument that is ALLOCATABLE, POINTER, or non-OPTIONAL
call sa(null())
+ call sao ! ok
+ !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument that is ALLOCATABLE, POINTER, or non-OPTIONAL
+ call sao(null())
+ call so ! ok
+ call so(null()) ! ok
+ call soa ! ok
+ !ERROR: NULL() without MOLD= must not be associated with an assumed-rank dummy argument that is ALLOCATABLE, POINTER, or non-OPTIONAL
+ call soa(null())
end
end
diff --git a/flang/test/Semantics/modfile03.f90 b/flang/test/Semantics/modfile03.f90
index db0caeab973f..eb3136f0aa8b 100644
--- a/flang/test/Semantics/modfile03.f90
+++ b/flang/test/Semantics/modfile03.f90
@@ -135,10 +135,8 @@ module m6d
end
!Expect: m6d.mod
!module m6d
-! use m6a,only:t1
! use m6a,only:t2=>t1
-! private::t1
-! type(t2),parameter::p=t1()
+! type(t2),parameter::p=t2()
!end
module m6e
@@ -178,3 +176,98 @@ end
! use m7a,only:x
! private::x
!end
+
+module m8a
+ private foo
+ type t
+ contains
+ procedure, nopass :: foo
+ end type
+ contains
+ pure integer function foo(n)
+ integer, intent(in) :: n
+ foo = n
+ end
+end
+!Expect: m8a.mod
+!module m8a
+!type::t
+!contains
+!procedure,nopass::foo
+!end type
+!private::foo
+!contains
+!pure function foo(n)
+!integer(4),intent(in)::n
+!integer(4)::foo
+!end
+!end
+
+module m8b
+ use m8a
+ contains
+ subroutine foo(x,a)
+ type(t), intent(in) :: x
+ real a(x%foo(10))
+ end
+end
+!Expect: m8b.mod
+!module m8b
+!use m8a,only:m8a$foo=>foo
+!use m8a,only:t
+!private::m8a$foo
+!contains
+!subroutine foo(x,a)
+!type(t),intent(in)::x
+!real(4)::a(1_8:int(m8a$foo(10_4),kind=8))
+!end
+!end
+
+module m9a
+ private
+ public t
+ type t
+ integer n
+ contains
+ procedure f
+ end type
+ contains
+ pure integer function f(x, k)
+ class(t), intent(in) :: x
+ integer, intent(in) :: k
+ f = x%n + k
+ end
+end
+!Expect: m9a.mod
+!module m9a
+!type::t
+!integer(4)::n
+!contains
+!procedure::f
+!end type
+!private::f
+!contains
+!pure function f(x,k)
+!class(t),intent(in)::x
+!integer(4),intent(in)::k
+!integer(4)::f
+!end
+!end
+
+module m9b
+ use m9a
+ contains
+ subroutine s(x, y)
+ class(t), intent(in) :: x
+ real y(x%f(x%n))
+ end
+end
+!Expect: m9b.mod
+!module m9b
+!use m9a,only:t
+!contains
+!subroutine s(x,y)
+!class(t),intent(in)::x
+!real(4)::y(1_8:int(x%f(x%n),kind=8))
+!end
+!end
diff --git a/flang/test/Semantics/procinterface05.f90 b/flang/test/Semantics/procinterface05.f90
new file mode 100644
index 000000000000..8c3afbffb2cf
--- /dev/null
+++ b/flang/test/Semantics/procinterface05.f90
@@ -0,0 +1,14 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+interface a1
+ subroutine s1
+ interface a2
+ subroutine s2
+ !ERROR: Invalid specification expression: reference to local entity 'k'
+ real x(k)
+ end subroutine
+ end interface
+ !ERROR: Invalid specification expression: reference to local entity 'k'
+ real y(k)
+ end subroutine
+end interface
+end
diff --git a/flang/test/Semantics/shape.f90 b/flang/test/Semantics/shape.f90
index f43b81f2b44d..21e293031fd6 100644
--- a/flang/test/Semantics/shape.f90
+++ b/flang/test/Semantics/shape.f90
@@ -2,10 +2,12 @@
! Test comparisons that use the intrinsic SHAPE() as an operand
program testShape
contains
- subroutine sub1(arrayDummy)
- integer :: arrayDummy(:)
+ subroutine sub1(arrayDummy, assumedRank)
+ integer :: arrayDummy(:), assumedRank(..)
integer, allocatable :: arrayDeferred(:)
integer :: arrayLocal(2) = [88, 99]
+ integer, parameter :: aRrs = rank(shape(assumedRank))
+ integer(kind=merge(kind(1),-1,aRrs == 1)) :: test_aRrs
!ERROR: Dimension 1 of left operand has extent 1, but right operand has extent 0
!ERROR: Dimension 1 of left operand has extent 1, but right operand has extent 0
if (all(shape(arrayDummy)==shape(8))) then
@@ -45,5 +47,9 @@ contains
if (all(64==shape(arrayLocal))) then
print *, "hello"
end if
+ ! These can't be checked at compilation time
+ if (any(shape(assumedRank) == [1])) stop
+ if (any(lbound(assumedRank) == [1,2])) stop
+ if (any(ubound(assumedRank) == [1,2,3])) stop
end subroutine sub1
end program testShape
diff --git a/flang/test/Transforms/debug-complex-1.fir b/flang/test/Transforms/debug-complex-1.fir
new file mode 100644
index 000000000000..a3cbd767d8a5
--- /dev/null
+++ b/flang/test/Transforms/debug-complex-1.fir
@@ -0,0 +1,39 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+// Check conversion of complex types of different sizes. Both FIR and MLIR
+// variants are checked.
+
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+ func.func @test1(%x : !fir.complex<4>) -> !fir.complex<8> {
+ %1 = fir.convert %x : (!fir.complex<4>) -> !fir.complex<8>
+ return %1 : !fir.complex<8>
+ }loc(#loc1)
+ func.func @test2(%x : !fir.complex<4>) -> complex<f64> {
+ %1 = fir.convert %x : (!fir.complex<4>) -> complex<f64>
+ return %1 : complex<f64>
+ }loc(#loc2)
+ func.func @test3(%x : !fir.complex<4>) -> !fir.complex<16> {
+ %1 = fir.convert %x : (!fir.complex<4>) -> !fir.complex<16>
+ return %1 : !fir.complex<16>
+ }loc(#loc3)
+ func.func @test4(%x : !fir.complex<4>) -> complex<f128> {
+ %1 = fir.convert %x : (!fir.complex<4>) -> complex<f128>
+ return %1 : complex<f128>
+ }loc(#loc4)
+}
+#loc1 = loc("./simple.f90":2:1)
+#loc2 = loc("./simple.f90":5:1)
+#loc3 = loc("./simple.f90":8:1)
+#loc4 = loc("./simple.f90":11:1)
+
+// CHECK-DAG: #[[CMPX8:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 128, encoding = DW_ATE_complex_float>
+// CHECK-DAG: #[[CMPX4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 64, encoding = DW_ATE_complex_float>
+// CHECK-DAG: #[[CMPX16:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "complex", sizeInBits = 256, encoding = DW_ATE_complex_float>
+
+// CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX8]], #[[CMPX4]]>
+// CHECK-DAG: #[[TY2:.*]] = #llvm.di_subroutine_type<{{.*}}types = #[[CMPX16]], #[[CMPX4]]>
+
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test1"{{.*}}type = #[[TY1]]>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test2"{{.*}}type = #[[TY1]]>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test3"{{.*}}type = #[[TY2]]>
+// CHECK-DAG: #llvm.di_subprogram<{{.*}}name = "test4"{{.*}}type = #[[TY2]]>
diff --git a/flang/test/Transforms/debug-fixed-array-type.fir b/flang/test/Transforms/debug-fixed-array-type.fir
new file mode 100644
index 000000000000..401c72541183
--- /dev/null
+++ b/flang/test/Transforms/debug-fixed-array-type.fir
@@ -0,0 +1,34 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+module attributes {} {
+ func.func @_QQmain() attributes {fir.bindc_name = "mn"} {
+ %c7 = arith.constant 7 : index
+ %c8 = arith.constant 8 : index
+ %c6 = arith.constant 6 : index
+ %c5 = arith.constant 5 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %0 = fir.alloca !fir.array<3xi32> {bindc_name = "d1", uniq_name = "_QFEd1"}
+ %1 = fircg.ext_declare %0(%c3) {uniq_name = "_QFEd1"} : (!fir.ref<!fir.array<3xi32>>, index) -> !fir.ref<!fir.array<3xi32>> loc(#loc1)
+ %2 = fir.address_of(@_QFEd2) : !fir.ref<!fir.array<2x5xi32>>
+ %3 = fircg.ext_declare %2(%c2, %c5) {uniq_name = "_QFEd2"} : (!fir.ref<!fir.array<2x5xi32>>, index, index) -> !fir.ref<!fir.array<2x5xi32>> loc(#loc2)
+ %4 = fir.address_of(@_QFEd3) : !fir.ref<!fir.array<6x8x7xf32>>
+ %5 = fircg.ext_declare %4(%c6, %c8, %c7) {uniq_name = "_QFEd3"} : (!fir.ref<!fir.array<6x8x7xf32>>, index, index, index) -> !fir.ref<!fir.array<6x8x7xf32>> loc(#loc3)
+ return
+ } loc(#loc4)
+}
+
+#loc1 = loc("test.f90":5:1)
+#loc2 = loc("test.f90":6:11)
+#loc3 = loc("test.f90":7:11)
+#loc4 = loc("test.f90":2:8)
+
+
+// CHECK-DAG: #[[INT:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed>
+// CHECK-DAG: #[[REAL:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float>
+// CHECK-DAG: #[[D1TY:.*]] = #llvm.di_composite_type<tag = DW_TAG_array_type{{.*}}baseType = #[[INT]], elements = #llvm.di_subrange<count = 3 : i64, lowerBound = 1 : i64>>
+// CHECK-DAG: #[[D2TY:.*]] = #llvm.di_composite_type<tag = DW_TAG_array_type{{.*}}baseType = #[[INT]], elements = #llvm.di_subrange<count = 2 : i64, lowerBound = 1 : i64>, #llvm.di_subrange<count = 5 : i64, lowerBound = 1 : i64>>
+// CHECK-DAG: #[[D3TY:.*]] = #llvm.di_composite_type<tag = DW_TAG_array_type{{.*}}baseType = #[[REAL]], elements = #llvm.di_subrange<count = 6 : i64, lowerBound = 1 : i64>, #llvm.di_subrange<count = 8 : i64, lowerBound = 1 : i64>, #llvm.di_subrange<count = 7 : i64, lowerBound = 1 : i64>>
+// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d1"{{.*}}type = #[[D1TY]]>
+// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d2"{{.*}}type = #[[D2TY]]>
+// CHECK-DAG: #llvm.di_local_variable<{{.*}}name = "d3"{{.*}}type = #[[D3TY]]>
diff --git a/flang/test/Transforms/debug-module-1.fir b/flang/test/Transforms/debug-module-1.fir
new file mode 100644
index 000000000000..822ae01b99aa
--- /dev/null
+++ b/flang/test/Transforms/debug-module-1.fir
@@ -0,0 +1,40 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+
+module attributes {} {
+ fir.global @_QMhelperEgli : i32 {
+ %0 = fir.zero_bits i32
+ fir.has_value %0 : i32
+ } loc(#loc1)
+ fir.global @_QMhelperEglr : f32 {
+ %0 = fir.zero_bits f32
+ fir.has_value %0 : f32
+ } loc(#loc2)
+ func.func @_QMhelperPtest() {
+ %c67_i32 = arith.constant 67 : i32
+ %cst = arith.constant 1.234000e+01 : f32
+ %0 = fir.address_of(@_QMhelperEgli) : !fir.ref<i32>
+ %1 = fir.address_of(@_QMhelperEglr) : !fir.ref<f32>
+ fir.store %cst to %1 : !fir.ref<f32>
+ fir.store %c67_i32 to %0 : !fir.ref<i32>
+ return
+ } loc(#loc3)
+}
+#loc1 = loc("test.f90":12:11)
+#loc2 = loc("test.f90":15:8)
+#loc3 = loc("test.f90":20:5)
+
+// CHECK-DAG: #[[I4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed>
+// CHECK-DAG: #[[R4:.*]] = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float>
+// CHECK-DAG: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}>
+// CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}scope = #[[CU]], name = "helper"{{.*}}>
+// CHECK-DAG: #[[LOC1:.*]] = loc("{{.*}}test.f90":12{{.*}})
+// CHECK-DAG: #[[GLI:.*]] = #llvm.di_global_variable<scope = #[[MOD]], name = "gli", linkageName = "_QMhelperEgli"{{.*}}line = 12, type = #[[I4]], isDefined = true>
+// CHECK-DAG: #[[LOC2:.*]] = loc("{{.*}}test.f90":15{{.*}})
+// CHECK-DAG: #[[GLR:.*]] = #llvm.di_global_variable<scope = #[[MOD]], name = "glr", linkageName = "_QMhelperEglr"{{.*}}line = 15, type = #[[R4]], isDefined = true>
+// CHECK-DAG: #[[LOC3:.*]] = loc("{{.*}}test.f90":20{{.*}})
+// CHECK-DAG: #[[TEST:.*]] = #llvm.di_subprogram<{{.*}}compileUnit = #[[CU]], scope = #[[MOD]], name = "test", linkageName = "_QMhelperPtest"{{.*}}line = 20, scopeLine = 20{{.*}}>
+// CHECK-DAG: loc(fused<#[[GLI]]>[#[[LOC1]]])
+// CHECK-DAG: loc(fused<#[[GLR]]>[#[[LOC2]]])
+// CHECK-DAG: loc(fused<#[[TEST]]>[#[[LOC3]]])
+
diff --git a/flang/test/Transforms/debug-module-2.fir b/flang/test/Transforms/debug-module-2.fir
new file mode 100644
index 000000000000..6acdc1df23d2
--- /dev/null
+++ b/flang/test/Transforms/debug-module-2.fir
@@ -0,0 +1,35 @@
+// RUN: fir-opt --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu" --mlir-print-debuginfo %s | FileCheck %s
+
+module {
+ fir.global @_QMhelperEgli : i32 {
+ %0 = fir.zero_bits i32
+ fir.has_value %0 : i32
+ } loc(#loc3)
+ fir.global @_QMhelperEglr : f32 {
+ %0 = fir.zero_bits f32
+ fir.has_value %0 : f32
+ } loc(#loc4)
+}
+#di_basic_type = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "integer", sizeInBits = 32, encoding = DW_ATE_signed>
+#di_basic_type1 = #llvm.di_basic_type<tag = DW_TAG_base_type, name = "real", sizeInBits = 32, encoding = DW_ATE_float>
+
+#di_file = #llvm.di_file<"test.f90" in "">
+#di_subroutine_type = #llvm.di_subroutine_type<callingConvention = DW_CC_normal>
+
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[0]<>, sourceLanguage = DW_LANG_Fortran95, file = #di_file, producer = "flang version 19.0.0 (/home/haqadeer/work/llvm-project/flang 5d5c73cad421bdca6e43e1cc10704ff160f1a33e)", isOptimized = false, emissionKind = Full>
+#di_module = #llvm.di_module<file = #di_file, scope = #di_compile_unit, name = "helper", line = 11>
+#di_global_variable = #llvm.di_global_variable<scope = #di_module, name = "gli", linkageName = "_QMhelperEgli", file = #di_file, line = 12, type = #di_basic_type, isDefined = true>
+#di_global_variable1 = #llvm.di_global_variable<scope = #di_module, name = "glr", linkageName = "_QMhelperEglr", file = #di_file, line = 15, type = #di_basic_type1, isDefined = true>
+
+#loc1 = loc("test.f90":12:11)
+#loc2 = loc("test.f90":15:8)
+#loc3 = loc(fused<#di_global_variable>[#loc1])
+#loc4 = loc(fused<#di_global_variable1>[#loc2])
+
+
+// CHECK-DAG: #[[GLI:.*]] = #llvm.di_global_variable<{{.*}}name = "gli", linkageName = "_QMhelperEgli"{{.*}}>
+// CHECK-DAG: #[[GLR:.*]] = #llvm.di_global_variable<{{.*}}name = "glr", linkageName = "_QMhelperEglr"{{.*}}>
+// CHECK-DAG: #[[GLIE:.*]] = #llvm.di_global_variable_expression<var = #[[GLI]]>
+// CHECK-DAG: #[[GLRE:.*]] = #llvm.di_global_variable_expression<var = #[[GLR]]>
+// CHECK-DAG: llvm.mlir.global{{.*}}@_QMhelperEgli() {{{.*}}dbg_expr = #[[GLIE]]}
+// CHECK-DAG: llvm.mlir.global{{.*}}@_QMhelperEglr() {{{.*}}dbg_expr = #[[GLRE]]}
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 0649e9f7a767..134c5143d6d6 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -246,9 +246,6 @@ function(create_entrypoint_object fq_target_name)
if(NOT ADD_ENTRYPOINT_OBJ_SRCS)
message(FATAL_ERROR "`add_entrypoint_object` rule requires SRCS to be specified.")
endif()
- if(NOT ADD_ENTRYPOINT_OBJ_HDRS)
- message(FATAL_ERROR "`add_entrypoint_object` rule requires HDRS to be specified.")
- endif()
if(NOT ADD_ENTRYPOINT_OBJ_CXX_STANDARD)
set(ADD_ENTRYPOINT_OBJ_CXX_STANDARD ${CMAKE_CXX_STANDARD})
endif()
diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt
index 4e3d1cb9f533..7fb82c60a1bb 100644
--- a/libc/config/baremetal/arm/entrypoints.txt
+++ b/libc/config/baremetal/arm/entrypoints.txt
@@ -183,6 +183,10 @@ set(TARGET_LIBC_ENTRYPOINTS
# time.h entrypoints
libc.src.time.difftime
+
+ # internal entrypoints
+ libc.startup.baremetal.init
+ libc.startup.baremetal.fini
)
set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt
index 7efd9bcd5b3c..b769b43f03a2 100644
--- a/libc/config/baremetal/riscv/entrypoints.txt
+++ b/libc/config/baremetal/riscv/entrypoints.txt
@@ -183,6 +183,10 @@ set(TARGET_LIBC_ENTRYPOINTS
# time.h entrypoints
libc.src.time.difftime
+
+ # internal entrypoints
+ libc.startup.baremetal.init
+ libc.startup.baremetal.fini
)
set(TARGET_LIBM_ENTRYPOINTS
diff --git a/libc/docs/ctype.rst b/libc/docs/ctype.rst
index 7d77dadccc9b..828785c9b670 100644
--- a/libc/docs/ctype.rst
+++ b/libc/docs/ctype.rst
@@ -1,7 +1,11 @@
.. include:: check.rst
-ctype.h Functions
-=================
+=======
+ctype.h
+=======
+
+Functions
+=========
.. list-table::
:widths: auto
@@ -10,46 +14,61 @@ ctype.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - isalnum
- |check|
- 7.4.1.1
+ -
* - isalpha
- |check|
- 7.4.1.2
+ -
* - isblank
- |check|
- 7.4.1.3
+ -
* - iscntrl
- |check|
- 7.4.1.4
+ -
* - isdigit
- |check|
- 7.4.1.5
+ -
* - isgraph
- |check|
- 7.4.1.6
+ -
* - islower
- |check|
- 7.4.1.7
+ -
* - isprint
- |check|
- 7.4.1.8
+ -
* - ispunct
- |check|
- 7.4.1.9
+ -
* - isspace
- |check|
- 7.4.1.10
+ -
* - isupper
- |check|
- 7.4.1.11
+ -
* - isxdigit
- |check|
- 7.4.1.12
+ -
* - tolower
- |check|
- 7.4.2.1
+ -
* - toupper
- |check|
- 7.4.2.2
+ -
diff --git a/libc/docs/fenv.rst b/libc/docs/fenv.rst
index 1dee5515e117..e7a5a3fb2c81 100644
--- a/libc/docs/fenv.rst
+++ b/libc/docs/fenv.rst
@@ -1,7 +1,11 @@
.. include:: check.rst
-fenv.h Functions
-================
+======
+fenv.h
+======
+
+Macros
+======
.. list-table::
:widths: auto
@@ -10,55 +14,162 @@ fenv.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
+ * - FE_ALL_EXCEPT
+ - |check|
+ - 7.6.12
+ -
+ * - FE_DEC_DOWNWARD
+ -
+ - 7.6.14
+ -
+ * - FE_DEC_TONEAREST
+ -
+ - 7.6.14
+ -
+ * - FE_DEC_TONEARESTFROMZERO
+ -
+ - 7.6.14
+ -
+ * - FE_DEC_TOWARDZERO
+ -
+ - 7.6.14
+ -
+ * - FE_DEC_UPWARD
+ -
+ - 7.6.14
+ -
+ * - FE_DFL_ENV
+ - |check|
+ - 7.6.17
+ -
+ * - FE_DFL_MODE
+ -
+ - 7.6.11
+ -
+ * - FE_DIVBYZERO
+ - |check|
+ - 7.6.9
+ -
+  * - FE_DOWNWARD
+ -
+ - 7.6.13
+ -
+ * - FE_INEXACT
+ - |check|
+ - 7.6.9
+ -
+ * - FE_INVALID
+ - |check|
+ - 7.6.9
+ -
+ * - FE_OVERFLOW
+ - |check|
+ - 7.6.9
+ -
+ * - FE_TONEAREST
+ - |check|
+ - 7.6.13
+ -
+ * - FE_TONEARESTFROMZERO
+ -
+ - 7.6.13
+ -
+ * - FE_TOWARDZERO
+ - |check|
+ - 7.6.13
+ -
+ * - FE_UNDERFLOW
+ - |check|
+ - 7.6.9
+ -
+ * - FE_UPWARD
+ - |check|
+ - 7.6.13
+ -
+ * - __STDC_VERSION_FENV_H__
+ -
+ - 7.6.5
+ -
+
+Functions
+=========
+
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+ * - Function
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - fe_dec_getround
-
- 7.6.5.3
+ -
* - fe_dec_setround
-
- 7.6.5.6
+ -
* - feclearexcept
- |check|
- 7.6.4.1
+ -
* - fegetenv
- |check|
- 7.6.6.1
+ -
* - fegetexceptflag
- |check|
- 7.6.4.2
+ -
* - fegetmode
-
- 7.6.5.1
+ -
* - fegetround
- |check|
- 7.6.5.2
+ -
* - feholdexcept
- |check|
- 7.6.6.2
+ -
* - feraiseexcept
- |check|
- 7.6.4.3
+ -
* - fesetenv
- |check|
- 7.6.6.3
+ -
* - fesetexcept
- |check|
- 7.6.4.4
+ -
* - fesetexceptflag
- |check|
- 7.6.4.5
+ -
* - fesetmode
-
- 7.6.5.4
+ -
* - fesetround
- |check|
- 7.6.5.5
+ -
* - fetestexcept
- |check|
- 7.6.4.7
+ -
* - fetestexceptflag
- |check|
- 7.6.4.6
+ -
* - feupdateenv
- |check|
- 7.6.6.4
+ -
diff --git a/libc/docs/signal.rst b/libc/docs/signal.rst
index 7903bb439cb3..d1a7cb609560 100644
--- a/libc/docs/signal.rst
+++ b/libc/docs/signal.rst
@@ -1,7 +1,160 @@
.. include:: check.rst
-signal.h Functions
-==================
+========
+signal.h
+========
+
+Macros
+======
+
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+ * - Function
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
+ * - SIGABRT
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGALRM
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGBUS
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGCHLD
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGCONT
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGFPE
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGHUP
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGILL
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGINT
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGKILL
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGPIPE
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGPOLL
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGPROF
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGQUIT
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGRTMAX
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGRTMIN
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGSEGV
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGSTOP
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGSYS
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTERM
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTRAP
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTSTP
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTTIN
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGTTOU
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGURG
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGUSR1
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGUSR2
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGVTALRM
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGXCPU
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIGXFSZ
+ - |check|
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIG_DFL
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIG_ERR
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIG_HOLD
+ -
+ -
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+ * - SIG_IGN
+ - |check|
+ - 7.14.3
+ - https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html
+
+Functions
+=========
.. list-table::
:widths: auto
@@ -10,34 +163,45 @@ signal.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - kill
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/kill.html
* - raise
- |check|
- 7.14.2.1
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/raise.html
* - sigaction
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaction.html
* - sigaddset
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaddset.html
* - sigaltstack
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaltstack.html
* - sigdelset
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigdelset.html
* - sigemptyset
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigemptyset.html
* - sigfillset
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigfillset.html
* - signal
- |check|
- 7.14.1.1
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/signal.html
* - sigprocmask
- |check|
-
+ - https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigprocmask.html
diff --git a/libc/docs/stdbit.rst b/libc/docs/stdbit.rst
index 0a12b2b6d7b5..71f9bbfd1d00 100644
--- a/libc/docs/stdbit.rst
+++ b/libc/docs/stdbit.rst
@@ -1,7 +1,96 @@
.. include:: check.rst
-stdbit.h Functions
-==================
+========
+stdbit.h
+========
+
+Macros
+======
+
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+ * - Function
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
+ * - __STDC_ENDIAN_BIG__
+ - |check|
+ - 7.18.2.2
+ -
+ * - __STDC_ENDIAN_LITTLE__
+ - |check|
+ - 7.18.2.2
+ -
+ * - __STDC_ENDIAN_NATIVE__
+ - |check|
+ - 7.18.2.2
+ -
+ * - __STDC_VERSION_STDBIT_H__
+ - |check|
+ - 7.18.1.2
+ -
+ * - stdc_bit_ceil
+ - |check|
+ - 7.18.16.1
+ -
+ * - stdc_bit_floor
+ - |check|
+ - 7.18.15.1
+ -
+ * - stdc_bit_width
+ - |check|
+ - 7.18.14.1
+ -
+ * - stdc_count_ones
+ - |check|
+ - 7.18.12.1
+ -
+ * - stdc_count_zeros
+ - |check|
+ - 7.18.11.1
+ -
+ * - stdc_first_leading_one
+ - |check|
+ - 7.18.8.1
+ -
+ * - stdc_first_leading_zero
+ - |check|
+ - 7.18.7.1
+ -
+ * - stdc_first_trailing_one
+ - |check|
+ - 7.18.10.1
+ -
+ * - stdc_first_trailing_zero
+ - |check|
+ - 7.18.9.1
+ -
+ * - stdc_has_single_bit
+ - |check|
+ - 7.18.13.1
+ -
+ * - stdc_leading_ones
+ - |check|
+ - 7.18.4.1
+ -
+ * - stdc_leading_zeros
+ - |check|
+ - 7.18.3.1
+ -
+ * - stdc_trailing_ones
+ - |check|
+ - 7.18.6.1
+ -
+ * - stdc_trailing_zeros
+ - |check|
+ - 7.18.5.1
+ -
+
+Functions
+=========
.. list-table::
:widths: auto
@@ -10,214 +99,285 @@ stdbit.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - stdc_bit_ceil_uc
- |check|
- 7.18.16
+ -
* - stdc_bit_ceil_ui
- |check|
- 7.18.16
+ -
* - stdc_bit_ceil_ul
- |check|
- 7.18.16
+ -
* - stdc_bit_ceil_ull
- |check|
- 7.18.16
+ -
* - stdc_bit_ceil_us
- |check|
- 7.18.16
+ -
* - stdc_bit_floor_uc
- |check|
- 7.18.15
+ -
* - stdc_bit_floor_ui
- |check|
- 7.18.15
+ -
* - stdc_bit_floor_ul
- |check|
- 7.18.15
+ -
* - stdc_bit_floor_ull
- |check|
- 7.18.15
+ -
* - stdc_bit_floor_us
- |check|
- 7.18.15
+ -
* - stdc_bit_width_uc
- |check|
- 7.18.14
+ -
* - stdc_bit_width_ui
- |check|
- 7.18.14
+ -
* - stdc_bit_width_ul
- |check|
- 7.18.14
+ -
* - stdc_bit_width_ull
- |check|
- 7.18.14
+ -
* - stdc_bit_width_us
- |check|
- 7.18.14
+ -
* - stdc_count_ones_uc
- |check|
- 7.18.12
+ -
* - stdc_count_ones_ui
- |check|
- 7.18.12
+ -
* - stdc_count_ones_ul
- |check|
- 7.18.12
+ -
* - stdc_count_ones_ull
- |check|
- 7.18.12
+ -
* - stdc_count_ones_us
- |check|
- 7.18.12
+ -
* - stdc_count_zeros_uc
- |check|
- 7.18.11
+ -
* - stdc_count_zeros_ui
- |check|
- 7.18.11
+ -
* - stdc_count_zeros_ul
- |check|
- 7.18.11
+ -
* - stdc_count_zeros_ull
- |check|
- 7.18.11
+ -
* - stdc_count_zeros_us
- |check|
- 7.18.11
+ -
* - stdc_first_leading_one_uc
- |check|
- 7.18.8
+ -
* - stdc_first_leading_one_ui
- |check|
- 7.18.8
+ -
* - stdc_first_leading_one_ul
- |check|
- 7.18.8
+ -
* - stdc_first_leading_one_ull
- |check|
- 7.18.8
+ -
* - stdc_first_leading_one_us
- |check|
- 7.18.8
+ -
* - stdc_first_leading_zero_uc
- |check|
- 7.18.7
+ -
* - stdc_first_leading_zero_ui
- |check|
- 7.18.7
+ -
* - stdc_first_leading_zero_ul
- |check|
- 7.18.7
+ -
* - stdc_first_leading_zero_ull
- |check|
- 7.18.7
+ -
* - stdc_first_leading_zero_us
- |check|
- 7.18.7
+ -
* - stdc_first_trailing_one_uc
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_one_ui
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_one_ul
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_one_ull
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_one_us
- |check|
- 7.18.10
+ -
* - stdc_first_trailing_zero_uc
- |check|
- 7.18.9
+ -
* - stdc_first_trailing_zero_ui
- |check|
- 7.18.9
+ -
* - stdc_first_trailing_zero_ul
- |check|
- 7.18.9
+ -
* - stdc_first_trailing_zero_ull
- |check|
- 7.18.9
+ -
* - stdc_first_trailing_zero_us
- |check|
- 7.18.9
+ -
* - stdc_has_single_bit_uc
- |check|
- 7.18.13
+ -
* - stdc_has_single_bit_ui
- |check|
- 7.18.13
+ -
* - stdc_has_single_bit_ul
- |check|
- 7.18.13
+ -
* - stdc_has_single_bit_ull
- |check|
- 7.18.13
+ -
* - stdc_has_single_bit_us
- |check|
- 7.18.13
+ -
* - stdc_leading_ones_uc
- |check|
- 7.18.4
+ -
* - stdc_leading_ones_ui
- |check|
- 7.18.4
+ -
* - stdc_leading_ones_ul
- |check|
- 7.18.4
+ -
* - stdc_leading_ones_ull
- |check|
- 7.18.4
+ -
* - stdc_leading_ones_us
- |check|
- 7.18.4
+ -
* - stdc_leading_zeros_uc
- |check|
- 7.18.3
+ -
* - stdc_leading_zeros_ui
- |check|
- 7.18.3
+ -
* - stdc_leading_zeros_ul
- |check|
- 7.18.3
+ -
* - stdc_leading_zeros_ull
- |check|
- 7.18.3
+ -
* - stdc_leading_zeros_us
- |check|
- 7.18.3
+ -
* - stdc_trailing_ones_uc
- |check|
- 7.18.6
+ -
* - stdc_trailing_ones_ui
- |check|
- 7.18.6
+ -
* - stdc_trailing_ones_ul
- |check|
- 7.18.6
+ -
* - stdc_trailing_ones_ull
- |check|
- 7.18.6
+ -
* - stdc_trailing_ones_us
- |check|
- 7.18.6
+ -
* - stdc_trailing_zeros_uc
- |check|
- 7.18.5
+ -
* - stdc_trailing_zeros_ui
- |check|
- 7.18.5
+ -
* - stdc_trailing_zeros_ul
- |check|
- 7.18.5
+ -
* - stdc_trailing_zeros_ull
- |check|
- 7.18.5
+ -
* - stdc_trailing_zeros_us
- |check|
- 7.18.5
+ -
diff --git a/libc/docs/threads.rst b/libc/docs/threads.rst
index 78e17e9fdec3..63cd6c40e145 100644
--- a/libc/docs/threads.rst
+++ b/libc/docs/threads.rst
@@ -1,7 +1,32 @@
.. include:: check.rst
-threads.h Functions
-===================
+=========
+threads.h
+=========
+
+Macros
+======
+
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+ * - Function
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
+ * - ONCE_FLAG_INIT
+ -
+ - 7.28.1.3
+ -
+ * - TSS_DTOR_ITERATIONS
+ -
+ - 7.28.1.3
+ -
+
+Functions
+=========
.. list-table::
:widths: auto
@@ -10,79 +35,105 @@ threads.h Functions
* - Function
- Implemented
- - Standard
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section
* - call_once
- |check|
- 7.28.2.1
+ -
* - cnd_broadcast
- |check|
- 7.28.3.1
+ -
* - cnd_destroy
- |check|
- 7.28.3.2
+ -
* - cnd_init
- |check|
- 7.28.3.3
+ -
* - cnd_signal
- |check|
- 7.28.3.4
+ -
* - cnd_timedwait
-
- 7.28.3.5
+ -
* - cnd_wait
- |check|
- 7.28.3.6
+ -
* - mtx_destroy
- |check|
- 7.28.4.1
+ -
* - mtx_init
- |check|
- 7.28.4.2
+ -
* - mtx_lock
- |check|
- 7.28.4.3
+ -
* - mtx_timedlock
-
- 7.28.4.4
+ -
* - mtx_trylock
-
- 7.28.4.5
+ -
* - mtx_unlock
- |check|
- 7.28.4.6
+ -
* - thrd_create
- |check|
- 7.28.5.1
+ -
* - thrd_current
- |check|
- 7.28.5.2
+ -
* - thrd_detach
- |check|
- 7.28.5.3
+ -
* - thrd_equal
- |check|
- 7.28.5.4
+ -
* - thrd_exit
- |check|
- 7.28.5.5
+ -
* - thrd_join
- |check|
- 7.28.5.6
+ -
* - thrd_sleep
-
- 7.28.5.7
+ -
* - thrd_yield
-
- 7.28.5.8
+ -
* - tss_create
- |check|
- 7.28.6.1
+ -
* - tss_delete
- |check|
- 7.28.6.2
+ -
* - tss_get
- |check|
- 7.28.6.3
+ -
* - tss_set
- |check|
- 7.28.6.4
+ -
diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt
index 34412be4dfed..9ea0b59befe7 100644
--- a/libc/src/__support/threads/CMakeLists.txt
+++ b/libc/src/__support/threads/CMakeLists.txt
@@ -71,3 +71,12 @@ if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.callonce)
.${LIBC_TARGET_OS}.callonce
)
endif()
+
+if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.CndVar)
+ add_object_library(
+ CndVar
+ ALIAS
+ DEPENDS
+ .${LIBC_TARGET_OS}.CndVar
+ )
+endif()
diff --git a/libc/src/__support/threads/CndVar.h b/libc/src/__support/threads/CndVar.h
new file mode 100644
index 000000000000..baa2a686c57d
--- /dev/null
+++ b/libc/src/__support/threads/CndVar.h
@@ -0,0 +1,52 @@
+//===-- A platform independent abstraction layer for cond vars --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_CNDVAR_H
+#define LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_CNDVAR_H
+
+#include "src/__support/threads/linux/futex_utils.h" // Futex
+#include "src/__support/threads/mutex.h" // Mutex
+
+#include <stdint.h> // uint32_t
+
+namespace LIBC_NAMESPACE {
+
+struct CndVar {
+ enum CndWaiterStatus : uint32_t {
+ WS_Waiting = 0xE,
+ WS_Signalled = 0x5,
+ };
+
+ struct CndWaiter {
+ Futex futex_word = WS_Waiting;
+ CndWaiter *next = nullptr;
+ };
+
+ CndWaiter *waitq_front;
+ CndWaiter *waitq_back;
+ Mutex qmtx;
+
+ static int init(CndVar *cv) {
+ cv->waitq_front = cv->waitq_back = nullptr;
+ auto err = Mutex::init(&cv->qmtx, false, false, false);
+ return err == MutexError::NONE ? 0 : -1;
+ }
+
+ static void destroy(CndVar *cv) {
+ cv->waitq_front = cv->waitq_back = nullptr;
+ }
+
+ // Returns 0 on success, -1 on error.
+ int wait(Mutex *m);
+ void notify_one();
+ void broadcast();
+};
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_CNDVAR_H
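With the class now living under src/__support, the threads.h entrypoints touched by the CMake changes further below only need to forward to it. A minimal wrapper sketch — the names cnd_wait_sketch and cnd_signal_sketch are assumptions, not the patch's actual entrypoint sources, and the casts presuppose that cnd_t and mtx_t are large enough to hold CndVar and Mutex, which the real entrypoints must guarantee separately:

#include "src/__support/threads/CndVar.h"
#include "src/__support/threads/mutex.h"
#include <threads.h> // cnd_t, mtx_t, thrd_success, thrd_error

namespace LIBC_NAMESPACE {

int cnd_wait_sketch(cnd_t *cond, mtx_t *mtx) {
  // Reinterpret the public handles as the internal types and forward.
  CndVar *cv = reinterpret_cast<CndVar *>(cond);
  Mutex *m = reinterpret_cast<Mutex *>(mtx);
  return cv->wait(m) == 0 ? thrd_success : thrd_error;
}

int cnd_signal_sketch(cnd_t *cond) {
  reinterpret_cast<CndVar *>(cond)->notify_one();
  return thrd_success;
}

} // namespace LIBC_NAMESPACE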
diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
index d3353f6b3ff8..39c4ad20201c 100644
--- a/libc/src/__support/threads/linux/CMakeLists.txt
+++ b/libc/src/__support/threads/linux/CMakeLists.txt
@@ -63,3 +63,16 @@ add_object_library(
DEPENDS
.futex_utils
)
+
+add_object_library(
+ CndVar
+ SRCS
+ CndVar.cpp
+ HDRS
+ ../CndVar.h
+ DEPENDS
+ libc.include.sys_syscall
+ libc.src.__support.OSUtil.osutil
+ libc.src.__support.threads.linux.futex_word_type
+ libc.src.__support.threads.mutex
+)
diff --git a/libc/src/__support/threads/linux/CndVar.cpp b/libc/src/__support/threads/linux/CndVar.cpp
new file mode 100644
index 000000000000..daf56bca1ed2
--- /dev/null
+++ b/libc/src/__support/threads/linux/CndVar.cpp
@@ -0,0 +1,103 @@
+//===-- Utility condition variable class ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/threads/CndVar.h"
+#include "src/__support/OSUtil/syscall.h" // syscall_impl
+#include "src/__support/threads/linux/futex_word.h" // FutexWordType
+#include "src/__support/threads/mutex.h" // Mutex, MutexLock
+
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE {
+
+int CndVar::wait(Mutex *m) {
+ // The goal is to perform "unlock |m| and wait" in an
+ // atomic operation. However, it is not possible to do it
+ // in the true sense so we do it in spirit. Before unlocking
+ // |m|, a new waiter object is added to the waiter queue with
+ // the waiter queue locked. Iff a signalling thread signals
+ // the waiter before the waiter actually starts waiting, the
+ // wait operation will not begin at all and the waiter immediately
+ // returns.
+
+ CndWaiter waiter;
+ {
+ MutexLock ml(&qmtx);
+ CndWaiter *old_back = nullptr;
+ if (waitq_front == nullptr) {
+ waitq_front = waitq_back = &waiter;
+ } else {
+ old_back = waitq_back;
+ waitq_back->next = &waiter;
+ waitq_back = &waiter;
+ }
+
+ if (m->unlock() != MutexError::NONE) {
+ // If we do not remove the queued up waiter before returning,
+ // then another thread can potentially signal a non-existing
+ // waiter. Note also that we do this with |qmtx| locked. This
+ // ensures that another thread will not signal the withdrawing
+ // waiter.
+ waitq_back = old_back;
+ if (waitq_back == nullptr)
+ waitq_front = nullptr;
+ else
+ waitq_back->next = nullptr;
+
+ return -1;
+ }
+ }
+
+ waiter.futex_word.wait(WS_Waiting, cpp::nullopt, true);
+
+ // At this point, if locking |m| fails, we can simply return as the
+ // queued up waiter would have been removed from the queue.
+ auto err = m->lock();
+ return err == MutexError::NONE ? 0 : -1;
+}
+
+void CndVar::notify_one() {
+ // We don't use an RAII locker in this method as we want to unlock
+ // |qmtx| and signal the waiter using a single FUTEX_WAKE_OP signal.
+ qmtx.lock();
+ if (waitq_front == nullptr) {
+ qmtx.unlock();
+ return;
+ }
+
+ CndWaiter *first = waitq_front;
+ waitq_front = waitq_front->next;
+ if (waitq_front == nullptr)
+ waitq_back = nullptr;
+
+ qmtx.futex_word = FutexWordType(Mutex::LockState::Free);
+
+ // this is a special WAKE_OP, so we use syscall directly
+ LIBC_NAMESPACE::syscall_impl<long>(
+ FUTEX_SYSCALL_ID, &qmtx.futex_word.val, FUTEX_WAKE_OP, 1, 1,
+ &first->futex_word.val,
+ FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ, WS_Waiting));
+}
+
+void CndVar::broadcast() {
+ MutexLock ml(&qmtx);
+ uint32_t dummy_futex_word;
+ CndWaiter *waiter = waitq_front;
+ waitq_front = waitq_back = nullptr;
+ while (waiter != nullptr) {
+ // FUTEX_WAKE_OP is used instead of just FUTEX_WAKE as it allows us to
+ // atomically update the waiter status to WS_Signalled before waking
+ // up the waiter. A dummy location is used for the other futex of
+ // FUTEX_WAKE_OP.
+ LIBC_NAMESPACE::syscall_impl<long>(
+ FUTEX_SYSCALL_ID, &dummy_futex_word, FUTEX_WAKE_OP, 1, 1,
+ &waiter->futex_word.val,
+ FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ, WS_Waiting));
+ waiter = waiter->next;
+ }
+}
+
+} // namespace LIBC_NAMESPACE
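The file above is easiest to read with the intended usage pattern in mind: wait() releases the caller's mutex while the thread parks on its per-waiter futex word, and notify_one() hand-unlocks |qmtx| and then issues a single FUTEX_WAKE_OP syscall that sets the chosen waiter's futex word to WS_Signalled and wakes it (waking a |qmtx| waiter too, if any). The following is an illustrative sketch only, not part of the change; it assumes a Mutex |m| and CndVar |cv| already set up via Mutex::init and CndVar::init as shown in the diff, and the |ready| flag and function names are hypothetical.

#include "src/__support/threads/CndVar.h"
#include "src/__support/threads/mutex.h"

bool ready = false; // hypothetical predicate guarded by |m|

void consumer(LIBC_NAMESPACE::Mutex &m, LIBC_NAMESPACE::CndVar &cv) {
  m.lock();
  while (!ready)   // always re-check the predicate after waking up
    cv.wait(&m);   // releases |m| while blocked, re-acquires it on return
  // ... consume the shared state guarded by |m| ...
  m.unlock();
}

void producer(LIBC_NAMESPACE::Mutex &m, LIBC_NAMESPACE::CndVar &cv) {
  m.lock();
  ready = true;
  m.unlock();
  cv.notify_one(); // wakes at most one queued CndWaiter
}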
diff --git a/libc/src/threads/linux/CMakeLists.txt b/libc/src/threads/linux/CMakeLists.txt
index 68b7106c2052..a5a02e47aab3 100644
--- a/libc/src/threads/linux/CMakeLists.txt
+++ b/libc/src/threads/linux/CMakeLists.txt
@@ -1,7 +1,6 @@
add_header_library(
threads_utils
HDRS
- CndVar.h
Futex.h
DEPENDS
libc.include.sys_syscall
@@ -20,8 +19,8 @@ add_entrypoint_object(
HDRS
../cnd_init.h
DEPENDS
- .threads_utils
libc.include.threads
+ libc.src.__support.threads.CndVar
)
add_entrypoint_object(
@@ -31,8 +30,8 @@ add_entrypoint_object(
HDRS
../cnd_destroy.h
DEPENDS
- .threads_utils
libc.include.threads
+ libc.src.__support.threads.CndVar
)
add_entrypoint_object(
@@ -42,9 +41,9 @@ add_entrypoint_object(
HDRS
../cnd_wait.h
DEPENDS
- .threads_utils
libc.include.threads
libc.src.__support.threads.mutex
+ libc.src.__support.threads.CndVar
)
add_entrypoint_object(
@@ -54,8 +53,8 @@ add_entrypoint_object(
HDRS
../cnd_signal.h
DEPENDS
- .threads_utils
libc.include.threads
+ libc.src.__support.threads.CndVar
)
add_entrypoint_object(
@@ -65,6 +64,6 @@ add_entrypoint_object(
HDRS
../cnd_broadcast.h
DEPENDS
- .threads_utils
libc.include.threads
+ libc.src.__support.threads.CndVar
)
diff --git a/libc/src/threads/linux/CndVar.h b/libc/src/threads/linux/CndVar.h
deleted file mode 100644
index c08ffa393856..000000000000
--- a/libc/src/threads/linux/CndVar.h
+++ /dev/null
@@ -1,148 +0,0 @@
-//===-- Utility condition variable class ------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_THREADS_LINUX_CNDVAR_H
-#define LLVM_LIBC_SRC_THREADS_LINUX_CNDVAR_H
-
-#include "src/__support/CPP/atomic.h"
-#include "src/__support/CPP/mutex.h" // lock_guard
-#include "src/__support/CPP/optional.h"
-#include "src/__support/OSUtil/syscall.h" // For syscall functions.
-#include "src/__support/threads/linux/futex_utils.h"
-#include "src/__support/threads/mutex.h"
-
-#include <linux/futex.h> // For futex operations.
-#include <stdint.h>
-#include <sys/syscall.h> // For syscall numbers.
-#include <threads.h> // For values like thrd_success etc.
-
-namespace LIBC_NAMESPACE {
-
-struct CndVar {
- enum CndWaiterStatus : uint32_t {
- WS_Waiting = 0xE,
- WS_Signalled = 0x5,
- };
-
- struct CndWaiter {
- Futex futex_word = WS_Waiting;
- CndWaiter *next = nullptr;
- };
-
- CndWaiter *waitq_front;
- CndWaiter *waitq_back;
- Mutex qmtx;
-
- static int init(CndVar *cv) {
- cv->waitq_front = cv->waitq_back = nullptr;
- auto err = Mutex::init(&cv->qmtx, false, false, false);
- return err == MutexError::NONE ? thrd_success : thrd_error;
- }
-
- static void destroy(CndVar *cv) {
- cv->waitq_front = cv->waitq_back = nullptr;
- }
-
- int wait(Mutex *m) {
- // The goal is to perform "unlock |m| and wait" in an
- // atomic operation. However, it is not possible to do it
- // in the true sense so we do it in spirit. Before unlocking
- // |m|, a new waiter object is added to the waiter queue with
- // the waiter queue locked. Iff a signalling thread signals
- // the waiter before the waiter actually starts waiting, the
- // wait operation will not begin at all and the waiter immediately
- // returns.
-
- CndWaiter waiter;
- {
- cpp::lock_guard ml(qmtx);
- CndWaiter *old_back = nullptr;
- if (waitq_front == nullptr) {
- waitq_front = waitq_back = &waiter;
- } else {
- old_back = waitq_back;
- waitq_back->next = &waiter;
- waitq_back = &waiter;
- }
-
- if (m->unlock() != MutexError::NONE) {
- // If we do not remove the queued up waiter before returning,
- // then another thread can potentially signal a non-existing
- // waiter. Note also that we do this with |qmtx| locked. This
- // ensures that another thread will not signal the withdrawing
- // waiter.
- waitq_back = old_back;
- if (waitq_back == nullptr)
- waitq_front = nullptr;
- else
- waitq_back->next = nullptr;
-
- return thrd_error;
- }
- }
-
- waiter.futex_word.wait(WS_Waiting, cpp::nullopt, true);
-
- // At this point, if locking |m| fails, we can simply return as the
- // queued up waiter would have been removed from the queue.
- auto err = m->lock();
- return err == MutexError::NONE ? thrd_success : thrd_error;
- }
-
- int notify_one() {
- // We don't use an RAII locker in this method as we want to unlock
- // |qmtx| and signal the waiter using a single FUTEX_WAKE_OP signal.
- qmtx.lock();
- if (waitq_front == nullptr) {
- qmtx.unlock();
- return thrd_success;
- }
-
- CndWaiter *first = waitq_front;
- waitq_front = waitq_front->next;
- if (waitq_front == nullptr)
- waitq_back = nullptr;
-
- qmtx.futex_word = FutexWordType(Mutex::LockState::Free);
-
- // this is a special WAKE_OP, so we use syscall directly
- LIBC_NAMESPACE::syscall_impl<long>(
- FUTEX_SYSCALL_ID, &qmtx.futex_word.val, FUTEX_WAKE_OP, 1, 1,
- &first->futex_word.val,
- FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ, WS_Waiting));
- return thrd_success;
- }
-
- int broadcast() {
- cpp::lock_guard ml(qmtx);
- uint32_t dummy_futex_word;
- CndWaiter *waiter = waitq_front;
- waitq_front = waitq_back = nullptr;
- while (waiter != nullptr) {
- // FUTEX_WAKE_OP is used instead of just FUTEX_WAKE as it allows us to
- // atomically update the waiter status to WS_Signalled before waking
- // up the waiter. A dummy location is used for the other futex of
- // FUTEX_WAKE_OP.
- LIBC_NAMESPACE::syscall_impl<long>(
- FUTEX_SYSCALL_ID, &dummy_futex_word, FUTEX_WAKE_OP, 1, 1,
- &waiter->futex_word.val,
- FUTEX_OP(FUTEX_OP_SET, WS_Signalled, FUTEX_OP_CMP_EQ, WS_Waiting));
- waiter = waiter->next;
- }
- return thrd_success;
- }
-};
-
-static_assert(sizeof(CndVar) == sizeof(cnd_t),
- "Mismatch in the size of the "
- "internal representation of condition variable and the public "
- "cnd_t type.");
-
-} // namespace LIBC_NAMESPACE
-
-#endif // LLVM_LIBC_SRC_THREADS_LINUX_CNDVAR_H
diff --git a/libc/src/threads/linux/cnd_broadcast.cpp b/libc/src/threads/linux/cnd_broadcast.cpp
index 180ac6d68ee8..a56aaa21ee12 100644
--- a/libc/src/threads/linux/cnd_broadcast.cpp
+++ b/libc/src/threads/linux/cnd_broadcast.cpp
@@ -6,16 +6,21 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
#include "src/threads/cnd_broadcast.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
+
+// TODO: https://github.com/llvm/llvm-project/issues/92968
+#include <threads.h> // cnd_t, thrd_error, thrd_success
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(int, cnd_broadcast, (cnd_t * cond)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
- return cndvar->broadcast();
+ cndvar->broadcast();
+ return thrd_success;
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/threads/linux/cnd_destroy.cpp b/libc/src/threads/linux/cnd_destroy.cpp
index 08eb3a1057b1..2b03b18c48e4 100644
--- a/libc/src/threads/linux/cnd_destroy.cpp
+++ b/libc/src/threads/linux/cnd_destroy.cpp
@@ -6,13 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
#include "src/threads/cnd_destroy.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
+
+#include <threads.h> // cnd_t
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(void, cnd_destroy, (cnd_t * cond)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
CndVar::destroy(cndvar);
diff --git a/libc/src/threads/linux/cnd_init.cpp b/libc/src/threads/linux/cnd_init.cpp
index 5e3f360b1d2b..d3d2c8a57d82 100644
--- a/libc/src/threads/linux/cnd_init.cpp
+++ b/libc/src/threads/linux/cnd_init.cpp
@@ -6,16 +6,19 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
#include "src/threads/cnd_init.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
+
+#include <threads.h> // cnd_t, thrd_error, thrd_success
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(int, cnd_init, (cnd_t * cond)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
- return CndVar::init(cndvar);
+ return CndVar::init(cndvar) ? thrd_error : thrd_success;
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/threads/linux/cnd_signal.cpp b/libc/src/threads/linux/cnd_signal.cpp
index dba01abdefbc..f144013e0882 100644
--- a/libc/src/threads/linux/cnd_signal.cpp
+++ b/libc/src/threads/linux/cnd_signal.cpp
@@ -6,16 +6,20 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
#include "src/threads/cnd_signal.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
+
+#include <threads.h> // cnd_t, thrd_error, thrd_success
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(int, cnd_signal, (cnd_t * cond)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
- return cndvar->notify_one();
+ cndvar->notify_one();
+ return thrd_success;
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/threads/linux/cnd_wait.cpp b/libc/src/threads/linux/cnd_wait.cpp
index db3d7f1436eb..97cade3f231d 100644
--- a/libc/src/threads/linux/cnd_wait.cpp
+++ b/libc/src/threads/linux/cnd_wait.cpp
@@ -6,18 +6,21 @@
//
//===----------------------------------------------------------------------===//
-#include "CndVar.h"
-
+#include "src/threads/cnd_wait.h"
#include "src/__support/common.h"
+#include "src/__support/threads/CndVar.h"
#include "src/__support/threads/mutex.h"
-#include "src/threads/cnd_wait.h"
+
+#include <threads.h> // cnd_t, mtx_t, thrd_error, thrd_success
namespace LIBC_NAMESPACE {
+static_assert(sizeof(CndVar) == sizeof(cnd_t));
+
LLVM_LIBC_FUNCTION(int, cnd_wait, (cnd_t * cond, mtx_t *mtx)) {
CndVar *cndvar = reinterpret_cast<CndVar *>(cond);
Mutex *mutex = reinterpret_cast<Mutex *>(mtx);
- return cndvar->wait(mutex);
+ return cndvar->wait(mutex) ? thrd_error : thrd_success;
}
} // namespace LIBC_NAMESPACE
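All five entrypoints above cast the public cnd_t to the internal CndVar, so the reinterpret_cast is only sound if the two types are layout-compatible; the diffs assert that the sizes match in each translation unit. A slightly stricter guard — hypothetical, not part of this change — would also pin down alignment:

#include "src/__support/threads/CndVar.h"
#include <threads.h> // cnd_t

// Hypothetical, stricter form of the per-TU layout guard.
static_assert(sizeof(LIBC_NAMESPACE::CndVar) == sizeof(cnd_t),
              "CndVar and cnd_t must be the same size");
static_assert(alignof(LIBC_NAMESPACE::CndVar) <= alignof(cnd_t),
              "CndVar must not require stricter alignment than cnd_t");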
diff --git a/libc/startup/baremetal/CMakeLists.txt b/libc/startup/baremetal/CMakeLists.txt
new file mode 100644
index 000000000000..4faced93fabe
--- /dev/null
+++ b/libc/startup/baremetal/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_entrypoint_object(
+ init
+ SRCS
+ init.cpp
+)
+
+add_entrypoint_object(
+ fini
+ SRCS
+ fini.cpp
+)
diff --git a/libc/startup/baremetal/fini.cpp b/libc/startup/baremetal/fini.cpp
new file mode 100644
index 000000000000..84997fb4fa1d
--- /dev/null
+++ b/libc/startup/baremetal/fini.cpp
@@ -0,0 +1,27 @@
+//===-- Implementation file of __libc_fini_array --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" {
+extern uintptr_t __fini_array_start[];
+extern uintptr_t __fini_array_end[];
+}
+
+namespace LIBC_NAMESPACE {
+
+using FiniCallback = void(void);
+
+extern "C" void __libc_fini_array(void) {
+ size_t fini_array_size = __fini_array_end - __fini_array_start;
+ for (size_t i = fini_array_size; i > 0; --i)
+ reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/startup/baremetal/init.cpp b/libc/startup/baremetal/init.cpp
new file mode 100644
index 000000000000..08dff74f0519
--- /dev/null
+++ b/libc/startup/baremetal/init.cpp
@@ -0,0 +1,32 @@
+//===-- Implementation file of __libc_init_array --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" {
+extern uintptr_t __preinit_array_start[];
+extern uintptr_t __preinit_array_end[];
+extern uintptr_t __init_array_start[];
+extern uintptr_t __init_array_end[];
+}
+
+namespace LIBC_NAMESPACE {
+
+using InitCallback = void(void);
+
+extern "C" void __libc_init_array(void) {
+ size_t preinit_array_size = __preinit_array_end - __preinit_array_start;
+ for (size_t i = 0; i < preinit_array_size; ++i)
+ reinterpret_cast<InitCallback *>(__preinit_array_start[i])();
+ size_t init_array_size = __init_array_end - __init_array_start;
+ for (size_t i = 0; i < init_array_size; ++i)
+ reinterpret_cast<InitCallback *>(__init_array_start[i])();
+}
+
+} // namespace LIBC_NAMESPACE
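Neither routine above is invoked automatically; the platform's startup code is expected to run __libc_init_array before main and __libc_fini_array after it. A minimal sketch of such an entry point follows — the _start name, main signature, and halt loop are assumptions for illustration, not part of this change.

extern "C" void __libc_init_array(void);
extern "C" void __libc_fini_array(void);
extern "C" int main(void);

extern "C" void _start(void) {
  __libc_init_array();  // .preinit_array, then .init_array, in order
  main();
  __libc_fini_array();  // .fini_array, in reverse order
  for (;;)              // nothing to return to on bare metal
    ;
}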
diff --git a/libc/test/integration/scudo/CMakeLists.txt b/libc/test/integration/scudo/CMakeLists.txt
index 8a085b618044..a5f7e3b63d24 100644
--- a/libc/test/integration/scudo/CMakeLists.txt
+++ b/libc/test/integration/scudo/CMakeLists.txt
@@ -9,6 +9,7 @@ endif()
# test will have to link to the LLVM libc startup system. LLVM libc's startup
# system is not complete enough to allow this. It is also desirable to
# keep the dependencies as minimal as possible.
+
add_entrypoint_library(
libc_for_scudo_integration_test
DEPENDS
@@ -17,6 +18,9 @@ add_entrypoint_library(
libc.src.stdlib.realloc
libc.src.stdlib.aligned_alloc
libc.src.stdlib.free
+ libc.src.errno.errno
+ libc.src.unistd.__llvm_libc_syscall
+ libc.src.sched.__sched_getcpucount
)
add_executable(
diff --git a/libc/utils/docgen/ctype.json b/libc/utils/docgen/ctype.json
index 25eeb683846c..af97e4bbbc0a 100644
--- a/libc/utils/docgen/ctype.json
+++ b/libc/utils/docgen/ctype.json
@@ -1,46 +1,46 @@
{
"functions": {
"isalnum": {
- "defined": "7.4.1.1"
+ "c-definition": "7.4.1.1"
},
"isalpha": {
- "defined": "7.4.1.2"
+ "c-definition": "7.4.1.2"
},
"isblank": {
- "defined": "7.4.1.3"
+ "c-definition": "7.4.1.3"
},
"iscntrl": {
- "defined": "7.4.1.4"
+ "c-definition": "7.4.1.4"
},
"isdigit": {
- "defined": "7.4.1.5"
+ "c-definition": "7.4.1.5"
},
"isgraph": {
- "defined": "7.4.1.6"
+ "c-definition": "7.4.1.6"
},
"islower": {
- "defined": "7.4.1.7"
+ "c-definition": "7.4.1.7"
},
"isprint": {
- "defined": "7.4.1.8"
+ "c-definition": "7.4.1.8"
},
"ispunct": {
- "defined": "7.4.1.9"
+ "c-definition": "7.4.1.9"
},
"isspace": {
- "defined": "7.4.1.10"
+ "c-definition": "7.4.1.10"
},
"isupper": {
- "defined": "7.4.1.11"
+ "c-definition": "7.4.1.11"
},
"isxdigit": {
- "defined": "7.4.1.12"
+ "c-definition": "7.4.1.12"
},
"tolower" : {
- "defined": "7.4.2.1"
+ "c-definition": "7.4.2.1"
},
"toupper": {
- "defined": "7.4.2.2"
+ "c-definition": "7.4.2.2"
}
}
}
diff --git a/libc/utils/docgen/docgen.py b/libc/utils/docgen/docgen.py
index 23d45305fe51..25e22d4d5877 100755
--- a/libc/utils/docgen/docgen.py
+++ b/libc/utils/docgen/docgen.py
@@ -13,70 +13,167 @@ from typing import Dict
import sys
import json
-
-def load_api(hname: str) -> Dict:
- p = Path(__file__).parent / Path(hname).with_suffix(".json")
- api = p.read_text(encoding="utf-8")
+from header import Header
+
+
+class DocgenAPIFormatError(Exception):
+ """Raised on fatal formatting errors with a description of a formatting error"""
+
+
+def check_api(header: Header, api: Dict):
+ """
+ Checks that docgen json files are properly formatted. If there are any
+ fatal formatting errors, raises exceptions with error messages useful for
+ fixing formatting. Warnings are printed to stderr on non-fatal formatting
+ errors. The code that runs after ``check_api(header, api)`` is called expects that
+ ``check_api`` executed without raising formatting exceptions so the json
+ matches the formatting specified here.
+
+ The json file may contain:
+ * an optional macros object
+ * an optional functions object
+
+ Formatting of ``macros`` and ``functions`` objects
+ ==================================================
+
+ If a macros or functions object is present, then it may contain nested
+ objects. Each of these nested objects should have a name matching a macro
+ or function's name, and each nested object must have the property:
+ ``"c-definition"`` or ``"posix-definition"``.
+
+ Description of properties
+ =========================
+ Each definition property is intended to be a reference to a part of the
+ standard that defines the function or macro. For the ``"c-definition"`` property,
+ this should be a C standard section number. For the ``"posix-definition"`` property,
+ this should be a link to the definition.
+
+ :param api: docgen json file contents parsed into a dict
+ """
+ errors = []
+ cdef = "c-definition"
+ pdef = "posix-definition"
+
+ # Validate macros
+ if "macros" in api:
+ if not header.macro_file_exists():
+ print(
+ f"warning: Macro definitions are listed for {header.name}, but no macro file can be found in the directory tree rooted at {header.macros_dir}. All macros will be listed as not implemented.",
+ file=sys.stderr,
+ )
+
+ macros = api["macros"]
+
+ for name, obj in macros.items():
+ if not (cdef in obj or pdef in obj):
+ err = f'error: Macro {name} does not contain at least one required property: "{cdef}" or "{pdef}"'
+ errors.append(err)
+
+ # Validate functions
+ if "functions" in api:
+ if not header.fns_dir_exists():
+ print(
+ f"warning: Function definitions are listed for {header.name}, but no function implementation directory exists at {header.fns_dir}. All functions will be listed as not implemented.",
+ file=sys.stderr,
+ )
+
+ fns = api["functions"]
+ for name, obj in fns.items():
+ if not (cdef in obj or pdef in obj):
+ err = f'error: function {name} does not contain at least one required property: "{cdef}" or "{pdef}"'
+ errors.append(err)
+
+ if errors:
+ raise DocgenAPIFormatError("\n".join(errors))
+
+
+def load_api(header: Header) -> Dict:
+ api = header.docgen_json.read_text(encoding="utf-8")
return json.loads(api)
-# TODO: we may need to get more sophisticated for less generic implementations.
-# Does libc/src/{hname minus .h suffix}/{fname}.cpp exist?
-def is_implemented(hname: str, fname: str) -> bool:
- path = Path(
- Path(__file__).parent.parent.parent,
- "src",
- hname.rstrip(".h")
+def print_tbl_dir():
+ print(
+ f"""
+.. list-table::
+ :widths: auto
+ :align: center
+ :header-rows: 1
+
+ * - Function
+ - Implemented
+ - C23 Standard Section
+ - POSIX.1-2017 Standard Section"""
)
- if not path.exists():
- raise FileNotFoundError(f"implementation dir does not exist: {path}")
- if not path.is_dir():
- raise NotADirectoryError(f"implementation dir is not a dir: {path}")
+def print_functions_rst(header: Header, functions: Dict):
+ tbl_hdr = "Functions"
+ print(tbl_hdr)
+ print("=" * len(tbl_hdr))
+
+ print_tbl_dir()
+
+ for name in sorted(functions.keys()):
+ print(f" * - {name}")
+
+ if header.fns_dir_exists() and header.implements_fn(name):
+ print(" - |check|")
+ else:
+ print(" -")
+
+ if "c-definition" in functions[name]:
+ print(f' - {functions[name]["c-definition"]}')
+ else:
+ print(" -")
+
+ if "posix-definition" in functions[name]:
+ print(f' - {functions[name]["posix-definition"]}')
+ else:
+ print(" -")
- # Recursively search for the target source file in the subdirectories under
- # libc/src/{hname}.
- for _ in path.glob("**/" + fname + ".cpp"):
- return True
- return False
+def print_macros_rst(header: Header, macros: Dict):
+ tbl_hdr = "Macros"
+ print(tbl_hdr)
+ print("=" * len(tbl_hdr))
+ print_tbl_dir()
-def print_functions(header: str, functions: Dict):
- for key in sorted(functions.keys()):
- print(f" * - {key}")
+ for name in sorted(macros.keys()):
+ print(f" * - {name}")
- if is_implemented(header, key):
+ if header.macro_file_exists() and header.implements_macro(name):
print(" - |check|")
else:
print(" -")
- # defined is optional. Having any content is optional.
- if functions[key] is not None and "defined" in functions[key]:
- print(f' - {functions[key]["defined"]}')
+ if "c-definition" in macros[name]:
+ print(f' - {macros[name]["c-definition"]}')
else:
print(" -")
+ if "posix-definition" in macros[name]:
+ print(f' - {macros[name]["posix-definition"]}')
+ else:
+ print(" -")
+ print()
+
-def print_header(header: str, api: Dict):
+def print_impl_status_rst(header: Header, api: Dict):
print(".. include:: check.rst\n")
- fns = f"{header} Functions"
- print(fns)
- print("=" * (len(fns)))
- print(
- f"""
-.. list-table::
- :widths: auto
- :align: center
- :header-rows: 1
- * - Function
- - Implemented
- - Standard"""
- )
- # TODO: how do we want to signal implementation of macros?
- print_functions(header, api["functions"])
+ print("=" * len(header.name))
+ print(header.name)
+ print("=" * len(header.name))
+ print()
+
+ # the macro and function sections are both optional
+ if "macros" in api:
+ print_macros_rst(header, api["macros"])
+
+ if "functions" in api:
+ print_functions_rst(header, api["functions"])
def parse_args() -> Namespace:
@@ -88,6 +185,8 @@ def parse_args() -> Namespace:
if __name__ == "__main__":
args = parse_args()
- api = load_api(args.header_name)
+ header = Header(args.header_name)
+ api = load_api(header)
+ check_api(header, api)
- print_header(args.header_name, api)
+ print_impl_status_rst(header, api)
diff --git a/libc/utils/docgen/fenv.json b/libc/utils/docgen/fenv.json
index 9aa3f641ddc9..788b196c053b 100644
--- a/libc/utils/docgen/fenv.json
+++ b/libc/utils/docgen/fenv.json
@@ -1,114 +1,114 @@
{
"macros": {
"__STDC_VERSION_FENV_H__": {
- "defined": "7.6.5"
+ "c-definition": "7.6.5"
},
"FE_DIVBYZERO": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_INEXACT": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_INVALID": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_OVERFLOW": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_UNDERFLOW": {
- "defined": "7.6.9"
+ "c-definition": "7.6.9"
},
"FE_ALL_EXCEPT": {
- "defined": "7.6.12"
+ "c-definition": "7.6.12"
},
"FE_DFL_MODE": {
- "defined": "7.6.11"
+ "c-definition": "7.6.11"
},
"FE_DOWNARD": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_TONEAREST": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_TONEARESTFROMZERO": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_TOWARDZERO": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_UPWARD": {
- "defined": "7.6.13"
+ "c-definition": "7.6.13"
},
"FE_DEC_DOWNWARD": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DEC_TONEAREST": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DEC_TONEARESTFROMZERO": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DEC_TOWARDZERO": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DEC_UPWARD": {
- "defined": "7.6.14"
+ "c-definition": "7.6.14"
},
"FE_DFL_ENV": {
- "defined": "7.6.17"
+ "c-definition": "7.6.17"
}
},
"functions": {
"feclearexcept": {
- "defined": "7.6.4.1"
+ "c-definition": "7.6.4.1"
},
"fegetexceptflag": {
- "defined": "7.6.4.2"
+ "c-definition": "7.6.4.2"
},
"feraiseexcept": {
- "defined": "7.6.4.3"
+ "c-definition": "7.6.4.3"
},
"fesetexcept": {
- "defined": "7.6.4.4"
+ "c-definition": "7.6.4.4"
},
"fesetexceptflag": {
- "defined": "7.6.4.5"
+ "c-definition": "7.6.4.5"
},
"fetestexceptflag": {
- "defined": "7.6.4.6"
+ "c-definition": "7.6.4.6"
},
"fetestexcept": {
- "defined": "7.6.4.7"
+ "c-definition": "7.6.4.7"
},
"fegetmode": {
- "defined": "7.6.5.1"
+ "c-definition": "7.6.5.1"
},
"fegetround": {
- "defined": "7.6.5.2"
+ "c-definition": "7.6.5.2"
},
"fe_dec_getround": {
- "defined": "7.6.5.3"
+ "c-definition": "7.6.5.3"
},
"fesetmode": {
- "defined": "7.6.5.4"
+ "c-definition": "7.6.5.4"
},
"fesetround": {
- "defined": "7.6.5.5"
+ "c-definition": "7.6.5.5"
},
"fe_dec_setround": {
- "defined": "7.6.5.6"
+ "c-definition": "7.6.5.6"
},
"fegetenv": {
- "defined": "7.6.6.1"
+ "c-definition": "7.6.6.1"
},
"feholdexcept": {
- "defined": "7.6.6.2"
+ "c-definition": "7.6.6.2"
},
"fesetenv": {
- "defined": "7.6.6.3"
+ "c-definition": "7.6.6.3"
},
"feupdateenv": {
- "defined": "7.6.6.4"
+ "c-definition": "7.6.6.4"
}
}
}
diff --git a/libc/utils/docgen/header.py b/libc/utils/docgen/header.py
new file mode 100644
index 000000000000..dde210078db2
--- /dev/null
+++ b/libc/utils/docgen/header.py
@@ -0,0 +1,87 @@
+# ====- Information about standard headers used by docgen ----*- python -*--==#
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ==-------------------------------------------------------------------------==#
+from pathlib import Path
+from typing import Generator
+
+
+class Header:
+ """
+ Maintains implementation information about a standard header file:
+ * where does its implementation dir live
+ * where is its macros file
+ * where is its docgen json file
+
+ By convention, the macro-only part of a header file is in a header-specific
+ file somewhere in the directory tree with root at
+ ``$LLVM_PROJECT_ROOT/libc/include/llvm-libc-macros``. Docgen expects that,
+ if a macro is implemented, it appears as the string
+ ``#define MACRO_NAME`` in some ``*-macros.h`` file in the directory tree.
+ Docgen searches for this string in the file to set the implementation status
+ shown in the generated rst docs rendered as html for display at
+ <libc.llvm.org>.
+
+ By convention, each function for a header is implemented in a function-specific
+ cpp file somewhere in the directory tree with root at, e.g.,
+ ``$LLVM_PROJECT_ROOT/libc/src/fenv``. Some headers have architecture-specific
+ implementations, like ``math``, and some don't, like ``fenv``. Docgen uses the
+ presence of this function-specific cpp file to set the implementation status
+ shown in the generated rst docs rendered as html for display at
+ <libc.llvm.org>.
+ """
+
+ def __init__(self, header_name: str):
+ """
+ :param header_name: e.g., ``"threads.h"`` or ``"signal.h"``
+ """
+ self.name = header_name
+ # Note: rstrip(".h") would strip any trailing '.' or 'h' characters
+ # (e.g. "math.h" -> "mat"), so remove the suffix explicitly.
+ self.stem = header_name[:-2] if header_name.endswith(".h") else header_name
+ self.docgen_root = Path(__file__).parent
+ self.libc_root = self.docgen_root.parent.parent
+ self.docgen_json = self.docgen_root / Path(header_name).with_suffix(".json")
+ self.fns_dir = Path(self.libc_root, "src", self.stem)
+ self.macros_dir = Path(self.libc_root, "include", "llvm-libc-macros")
+
+ def macro_file_exists(self) -> bool:
+ for _ in self.__get_macro_files():
+ return True
+
+ return False
+
+ def fns_dir_exists(self) -> bool:
+ return self.fns_dir.exists() and self.fns_dir.is_dir()
+
+ def implements_fn(self, fn_name: str) -> bool:
+ for _ in self.fns_dir.glob(f"**/{fn_name}.cpp"):
+ return True
+
+ return False
+
+ def implements_macro(self, m_name: str) -> bool:
+ """
+ Some macro files are in, e.g.,
+ ``$LLVM_PROJECT_ROOT/libc/include/llvm-libc-macros/fenv-macros.h``,
+ but others are in subdirectories, e.g., ``signal.h`` has the macro
+ definitions in
+ ``$LLVM_PROJECT_ROOT/libc/include/llvm-libc-macros/linux/signal-macros.h``.
+
+ :param m_name: name of macro, e.g., ``FE_ALL_EXCEPT``
+ """
+ for f in self.__get_macro_files():
+ if f"#define {m_name}" in f.read_text():
+ return True
+
+ return False
+
+ def __get_macro_files(self) -> Generator[Path, None, None]:
+ """
+ This function uses a glob on, e.g., ``"**/fcntl-macros.h"`` because the
+ macro file might be located in a subdirectory:
+ libc/include/llvm-libc-macros/fcntl-macros.h
+ libc/include/llvm-libc-macros/linux/fcntl-macros.h
+ """
+ return self.macros_dir.glob(f"**/{self.stem}-macros.h")
diff --git a/libc/utils/docgen/signal.json b/libc/utils/docgen/signal.json
index d5380d348b7d..337b0c19717b 100644
--- a/libc/utils/docgen/signal.json
+++ b/libc/utils/docgen/signal.json
@@ -1,47 +1,152 @@
{
"macros": {
"SIG_DFL": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIG_ERR": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIG_HOLD": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIG_IGN": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGRTMIN": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGRTMAX": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGABRT": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGALRM": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGBUS": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGCHLD": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGCONT": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGFPE": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGHUP": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGILL": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGINT": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGKILL": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGPIPE": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGPIPE": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGQUIT": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGSEGV": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGSTOP": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
},
"SIGTERM": {
- "defined": "7.14.3"
+ "c-definition": "7.14.3",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGTSTP": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGTTIN": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGTTOU": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGUSR1": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGUSR2": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGPOLL": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGPROF": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGSYS": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGTRAP": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGURG": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGVTALRM": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGXCPU": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
+ },
+ "SIGXFSZ": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/signal.h.html"
}
},
"functions": {
"signal": {
- "defined": "7.14.1.1"
+ "c-definition": "7.14.1.1",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/signal.html"
},
"raise": {
- "defined": "7.14.2.1"
- },
- "kill": null,
- "sigaction": null,
- "sigaddset": null,
- "sigaltstack": null,
- "sigdelset": null,
- "sigemptyset": null,
- "sigfillset": null,
- "sigprocmask": null
+ "c-definition": "7.14.2.1",
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/raise.html"
+ },
+ "kill": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/kill.html"
+ },
+ "sigaction": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaction.html"
+ },
+ "sigaddset": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaddset.html"
+ },
+ "sigaltstack": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigaltstack.html"
+ },
+ "sigdelset": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigdelset.html"
+ },
+ "sigemptyset": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigemptyset.html"
+ },
+ "sigfillset": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigfillset.html"
+ },
+ "sigprocmask": {
+ "posix-definition": "https://pubs.opengroup.org/onlinepubs/9699919799/functions/sigprocmask.html"
+ }
}
}
diff --git a/libc/utils/docgen/stdbit.json b/libc/utils/docgen/stdbit.json
index 88106cf0e4f9..25060c1ff9fd 100644
--- a/libc/utils/docgen/stdbit.json
+++ b/libc/utils/docgen/stdbit.json
@@ -1,270 +1,270 @@
{
"macros": {
"__STDC_VERSION_STDBIT_H__": {
- "defined": "7.18.1.2"
+ "c-definition": "7.18.1.2"
},
"__STDC_ENDIAN_LITTLE__": {
- "defined": "7.18.2.2"
+ "c-definition": "7.18.2.2"
},
"__STDC_ENDIAN_BIG__": {
- "defined": "7.18.2.2"
+ "c-definition": "7.18.2.2"
},
"__STDC_ENDIAN_NATIVE__": {
- "defined": "7.18.2.2"
+ "c-definition": "7.18.2.2"
},
"stdc_leading_zeros": {
- "defined": "7.18.3.1"
+ "c-definition": "7.18.3.1"
},
"stdc_leading_ones": {
- "defined": "7.18.4.1"
+ "c-definition": "7.18.4.1"
},
"stdc_trailing_zeros": {
- "defined": "7.18.5.1"
+ "c-definition": "7.18.5.1"
},
"stdc_trailing_ones": {
- "defined": "7.18.6.1"
+ "c-definition": "7.18.6.1"
},
"stdc_first_leading_zero": {
- "defined": "7.18.7.1"
+ "c-definition": "7.18.7.1"
},
"stdc_first_leading_one": {
- "defined": "7.18.8.1"
+ "c-definition": "7.18.8.1"
},
"stdc_first_trailing_zero": {
- "defined": "7.18.9.1"
+ "c-definition": "7.18.9.1"
},
"stdc_first_trailing_one": {
- "defined": "7.18.10.1"
+ "c-definition": "7.18.10.1"
},
"stdc_count_zeros": {
- "defined": "7.18.11.1"
+ "c-definition": "7.18.11.1"
},
"stdc_count_ones": {
- "defined": "7.18.12.1"
+ "c-definition": "7.18.12.1"
},
"stdc_has_single_bit": {
- "defined": "7.18.13.1"
+ "c-definition": "7.18.13.1"
},
"stdc_bit_width": {
- "defined": "7.18.14.1"
+ "c-definition": "7.18.14.1"
},
"stdc_bit_floor": {
- "defined": "7.18.15.1"
+ "c-definition": "7.18.15.1"
},
"stdc_bit_ceil": {
- "defined": "7.18.16.1"
+ "c-definition": "7.18.16.1"
}
},
"functions": {
"stdc_leading_zeros_uc": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_zeros_us": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_zeros_ui": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_zeros_ul": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_zeros_ull": {
- "defined": "7.18.3"
+ "c-definition": "7.18.3"
},
"stdc_leading_ones_uc": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_leading_ones_us": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_leading_ones_ui": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_leading_ones_ul": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_leading_ones_ull": {
- "defined": "7.18.4"
+ "c-definition": "7.18.4"
},
"stdc_trailing_zeros_uc": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_zeros_us": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_zeros_ui": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_zeros_ul": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_zeros_ull": {
- "defined": "7.18.5"
+ "c-definition": "7.18.5"
},
"stdc_trailing_ones_uc": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_trailing_ones_us": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_trailing_ones_ui": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_trailing_ones_ul": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_trailing_ones_ull": {
- "defined": "7.18.6"
+ "c-definition": "7.18.6"
},
"stdc_first_leading_zero_uc": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_zero_us": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_zero_ui": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_zero_ul": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_zero_ull": {
- "defined": "7.18.7"
+ "c-definition": "7.18.7"
},
"stdc_first_leading_one_uc": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_leading_one_us": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_leading_one_ui": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_leading_one_ul": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_leading_one_ull": {
- "defined": "7.18.8"
+ "c-definition": "7.18.8"
},
"stdc_first_trailing_zero_uc": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_zero_us": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_zero_ui": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_zero_ul": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_zero_ull": {
- "defined": "7.18.9"
+ "c-definition": "7.18.9"
},
"stdc_first_trailing_one_uc": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_first_trailing_one_us": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_first_trailing_one_ui": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_first_trailing_one_ul": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_first_trailing_one_ull": {
- "defined": "7.18.10"
+ "c-definition": "7.18.10"
},
"stdc_count_zeros_uc": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_zeros_us": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_zeros_ui": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_zeros_ul": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_zeros_ull": {
- "defined": "7.18.11"
+ "c-definition": "7.18.11"
},
"stdc_count_ones_uc": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_count_ones_us": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_count_ones_ui": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_count_ones_ul": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_count_ones_ull": {
- "defined": "7.18.12"
+ "c-definition": "7.18.12"
},
"stdc_has_single_bit_uc": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_has_single_bit_us": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_has_single_bit_ui": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_has_single_bit_ul": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_has_single_bit_ull": {
- "defined": "7.18.13"
+ "c-definition": "7.18.13"
},
"stdc_bit_width_uc": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_width_us": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_width_ui": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_width_ul": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_width_ull": {
- "defined": "7.18.14"
+ "c-definition": "7.18.14"
},
"stdc_bit_floor_uc": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_floor_us": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_floor_ui": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_floor_ul": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_floor_ull": {
- "defined": "7.18.15"
+ "c-definition": "7.18.15"
},
"stdc_bit_ceil_uc": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
},
"stdc_bit_ceil_us": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
},
"stdc_bit_ceil_ui": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
},
"stdc_bit_ceil_ul": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
},
"stdc_bit_ceil_ull": {
- "defined": "7.18.16"
+ "c-definition": "7.18.16"
}
}
}
diff --git a/libc/utils/docgen/threads.json b/libc/utils/docgen/threads.json
index aef6ffaf75ba..8591cbde55a4 100644
--- a/libc/utils/docgen/threads.json
+++ b/libc/utils/docgen/threads.json
@@ -1,87 +1,87 @@
{
"macros": {
"ONCE_FLAG_INIT": {
- "defined": "7.28.1.3"
+ "c-definition": "7.28.1.3"
},
"TSS_DTOR_ITERATIONS": {
- "defined": "7.28.1.3"
+ "c-definition": "7.28.1.3"
}
},
"functions": {
"call_once": {
- "defined": "7.28.2.1"
+ "c-definition": "7.28.2.1"
},
"cnd_broadcast": {
- "defined": "7.28.3.1"
+ "c-definition": "7.28.3.1"
},
"cnd_destroy": {
- "defined": "7.28.3.2"
+ "c-definition": "7.28.3.2"
},
"cnd_init": {
- "defined": "7.28.3.3"
+ "c-definition": "7.28.3.3"
},
"cnd_signal": {
- "defined": "7.28.3.4"
+ "c-definition": "7.28.3.4"
},
"cnd_timedwait": {
- "defined": "7.28.3.5"
+ "c-definition": "7.28.3.5"
},
"cnd_wait": {
- "defined": "7.28.3.6"
+ "c-definition": "7.28.3.6"
},
"mtx_destroy": {
- "defined": "7.28.4.1"
+ "c-definition": "7.28.4.1"
},
"mtx_init": {
- "defined": "7.28.4.2"
+ "c-definition": "7.28.4.2"
},
"mtx_lock": {
- "defined": "7.28.4.3"
+ "c-definition": "7.28.4.3"
},
"mtx_timedlock": {
- "defined": "7.28.4.4"
+ "c-definition": "7.28.4.4"
},
"mtx_trylock": {
- "defined": "7.28.4.5"
+ "c-definition": "7.28.4.5"
},
"mtx_unlock": {
- "defined": "7.28.4.6"
+ "c-definition": "7.28.4.6"
},
"thrd_create": {
- "defined": "7.28.5.1"
+ "c-definition": "7.28.5.1"
},
"thrd_current": {
- "defined": "7.28.5.2"
+ "c-definition": "7.28.5.2"
},
"thrd_detach": {
- "defined": "7.28.5.3"
+ "c-definition": "7.28.5.3"
},
"thrd_equal": {
- "defined": "7.28.5.4"
+ "c-definition": "7.28.5.4"
},
"thrd_exit": {
- "defined": "7.28.5.5"
+ "c-definition": "7.28.5.5"
},
"thrd_join": {
- "defined": "7.28.5.6"
+ "c-definition": "7.28.5.6"
},
"thrd_sleep": {
- "defined": "7.28.5.7"
+ "c-definition": "7.28.5.7"
},
"thrd_yield": {
- "defined": "7.28.5.8"
+ "c-definition": "7.28.5.8"
},
"tss_create": {
- "defined": "7.28.6.1"
+ "c-definition": "7.28.6.1"
},
"tss_delete": {
- "defined": "7.28.6.2"
+ "c-definition": "7.28.6.2"
},
"tss_get": {
- "defined": "7.28.6.3"
+ "c-definition": "7.28.6.3"
},
"tss_set": {
- "defined": "7.28.6.4"
+ "c-definition": "7.28.6.4"
}
}
}
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 83fcd40bb80c..0bc343acd281 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -53,6 +53,7 @@ Implemented Papers
- P2387R3 - Pipe support for user-defined range adaptors
- P2713R1 - Escaping improvements in ``std::format``
- P2231R1 - Missing ``constexpr`` in ``std::optional`` and ``std::variant``
+- P0019R8 - ``std::atomic_ref``
Improvements and New Features
-----------------------------
diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv
index db57b15256a6..5f83fa3a92e8 100644
--- a/libcxx/docs/Status/Cxx20Issues.csv
+++ b/libcxx/docs/Status/Cxx20Issues.csv
@@ -70,7 +70,7 @@
"`3041 <https://wg21.link/LWG3041>`__","Unnecessary ``decay``\ in ``reference_wrapper``\ ","Jacksonville","|Complete|",""
"`3042 <https://wg21.link/LWG3042>`__","``is_literal_type_v``\ should be inline","Jacksonville","|Complete|",""
"`3043 <https://wg21.link/LWG3043>`__","Bogus postcondition for ``filesystem_error``\ constructor","Jacksonville","|Complete|",""
-"`3045 <https://wg21.link/LWG3045>`__","``atomic<floating-point>``\ doesn't have ``value_type``\ or ``difference_type``\ ","Jacksonville","",""
+"`3045 <https://wg21.link/LWG3045>`__","``atomic<floating-point>``\ doesn't have ``value_type``\ or ``difference_type``\ ","Jacksonville","|Complete|","18.0"
"`3048 <https://wg21.link/LWG3048>`__","``transform_reduce(exec, first1, last1, first2, init)``\ discards execution policy","Jacksonville","|Complete|","17.0"
"`3051 <https://wg21.link/LWG3051>`__","Floating point classifications were inadvertently changed in P0175","Jacksonville","|Nothing To Do|",""
"`3075 <https://wg21.link/LWG3075>`__","``basic_string``\ needs deduction guides from ``basic_string_view``\ ","Jacksonville","|Complete|",""
diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index 955aa5f614af..6598cd18358f 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -26,7 +26,7 @@
"`P0905R1 <https://wg21.link/P0905R1>`__","CWG","Symmetry for spaceship","Jacksonville","|Complete|","7.0","|spaceship|"
"`P0966R1 <https://wg21.link/P0966R1>`__","LWG","``string::reserve``\ Should Not Shrink","Jacksonville","|Complete| [#note-P0966]_","12.0"
"","","","","","",""
-"`P0019R8 <https://wg21.link/P0019R8>`__","LWG","Atomic Ref","Rapperswil","",""
+"`P0019R8 <https://wg21.link/P0019R8>`__","LWG","Atomic Ref","Rapperswil","|Complete|","19.0"
"`P0458R2 <https://wg21.link/P0458R2>`__","LWG","Checking for Existence of an Element in Associative Containers","Rapperswil","|Complete|","13.0"
"`P0475R1 <https://wg21.link/P0475R1>`__","LWG","LWG 2511: guaranteed copy elision for piecewise construction","Rapperswil","|Complete|",""
"`P0476R2 <https://wg21.link/P0476R2>`__","LWG","Bit-casting object representations","Rapperswil","|Complete|","14.0"
@@ -125,7 +125,7 @@
"`P1612R1 <https://wg21.link/P1612R1>`__","LWG","Relocate Endian's Specification","Cologne","|Complete|","10.0"
"`P1614R2 <https://wg21.link/P1614R2>`__","LWG","The Mothership has Landed","Cologne","|In Progress|",""
"`P1638R1 <https://wg21.link/P1638R1>`__","LWG","basic_istream_view::iterator should not be copyable","Cologne","|Complete|","16.0","|ranges|"
-"`P1643R1 <https://wg21.link/P1643R1>`__","LWG","Add wait/notify to atomic_ref","Cologne","",""
+"`P1643R1 <https://wg21.link/P1643R1>`__","LWG","Add wait/notify to atomic_ref","Cologne","|Complete|","19.0"
"`P1644R0 <https://wg21.link/P1644R0>`__","LWG","Add wait/notify to atomic<shared_ptr>","Cologne","",""
"`P1650R0 <https://wg21.link/P1650R0>`__","LWG","Output std::chrono::days with 'd' suffix","Cologne","|Complete|","16.0"
"`P1651R0 <https://wg21.link/P1651R0>`__","LWG","bind_front should not unwrap reference_wrapper","Cologne","|Complete|","13.0"
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index d421feef8db9..cc601b3cd3c9 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -98,7 +98,7 @@
`3555 <https://wg21.link/LWG3555>`__,"``{transform,elements}_view::iterator::iterator_concept`` should consider const-qualification of the underlying range","June 2021","","","|ranges|"
"","","","","",""
`2191 <https://wg21.link/LWG2191>`__,"Incorrect specification of ``match_results(match_results&&)``","October 2021","|Nothing To Do|",""
-`2381 <https://wg21.link/LWG2381>`__,"Inconsistency in parsing floating point numbers","October 2021","",""
+`2381 <https://wg21.link/LWG2381>`__,"Inconsistency in parsing floating point numbers","October 2021","|Complete|","19.0"
`2762 <https://wg21.link/LWG2762>`__,"``unique_ptr operator*()`` should be ``noexcept``","October 2021","",""
`3121 <https://wg21.link/LWG3121>`__,"``tuple`` constructor constraints for ``UTypes&&...`` overloads","October 2021","",""
`3123 <https://wg21.link/LWG3123>`__,"``duration`` constructor from representation shouldn't be effectively non-throwing","October 2021","","","|chrono|"
diff --git a/libcxx/docs/Status/ParallelismProjects.csv b/libcxx/docs/Status/ParallelismProjects.csv
index 06da008ac5fe..2ddac1e52f02 100644
--- a/libcxx/docs/Status/ParallelismProjects.csv
+++ b/libcxx/docs/Status/ParallelismProjects.csv
@@ -24,6 +24,7 @@ Section,Description,Dependencies,Assignee,Complete
| `[parallel.simd.class] <https://wg21.link/N4808>`_, "`simd generate constructor <https://reviews.llvm.org/D159442>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.class] <https://wg21.link/N4808>`_, "`simd load constructor <https://github.com/llvm/llvm-project/pull/76610>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.class] <https://wg21.link/N4808>`_, "`simd subscript operators <https://github.com/llvm/llvm-project/pull/68960>`_", None, Yin Zhang, |Complete|
+| `[parallel.simd.class] <https://wg21.link/N4808>`_, "`simd copy functions <https://github.com/llvm/llvm-project/pull/78935>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.class] <https://wg21.link/N4808>`_, "Class template simd implementation", None, Yin Zhang, |In Progress|
| `[parallel.simd.nonmembers] <https://wg21.link/N4808>`_, "simd non-member operations", None, Yin Zhang, |In Progress|
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`Class template simd_mask declaration and alias <https://reviews.llvm.org/D144362>`_", [parallel.simd.abi], Yin Zhang, |Complete|
@@ -33,5 +34,6 @@ Section,Description,Dependencies,Assignee,Complete
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`simd_mask implicit type conversion constructor <https://github.com/llvm/llvm-project/pull/71132>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`simd_mask load constructor <https://github.com/llvm/llvm-project/pull/76610>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`simd_mask subscript operators <https://github.com/llvm/llvm-project/pull/68960>`_", None, Yin Zhang, |Complete|
+| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "`simd_mask copy functions <https://github.com/llvm/llvm-project/pull/78935>`_", None, Yin Zhang, |Complete|
| `[parallel.simd.mask.class] <https://wg21.link/N4808>`_, "Class template simd_mask implementation", None, Yin Zhang, |In Progress|
| `[parallel.simd.mask.nonmembers] <https://wg21.link/N4808>`_, "simd_mask non-member operations", None, Yin Zhang, |In Progress|
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 01e9c247560c..954e0c04ec85 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -224,6 +224,7 @@ set(files
__atomic/atomic_flag.h
__atomic/atomic_init.h
__atomic/atomic_lock_free.h
+ __atomic/atomic_ref.h
__atomic/atomic_sync.h
__atomic/check_memory_order.h
__atomic/contention_t.h
@@ -232,6 +233,7 @@ set(files
__atomic/is_always_lock_free.h
__atomic/kill_dependency.h
__atomic/memory_order.h
+ __atomic/to_gcc_order.h
__availability
__bit/bit_cast.h
__bit/bit_ceil.h
diff --git a/libcxx/include/__algorithm/copy_move_common.h b/libcxx/include/__algorithm/copy_move_common.h
index 12a26c6d6a64..8a98451a8f96 100644
--- a/libcxx/include/__algorithm/copy_move_common.h
+++ b/libcxx/include/__algorithm/copy_move_common.h
@@ -21,7 +21,6 @@
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_constructible.h>
#include <__type_traits/is_trivially_assignable.h>
-#include <__type_traits/is_trivially_copyable.h>
#include <__type_traits/is_volatile.h>
#include <__utility/move.h>
#include <__utility/pair.h>
diff --git a/libcxx/include/__algorithm/pstl_copy.h b/libcxx/include/__algorithm/pstl_copy.h
index 0fcea33c3919..3e17131f1356 100644
--- a/libcxx/include/__algorithm/pstl_copy.h
+++ b/libcxx/include/__algorithm/pstl_copy.h
@@ -20,7 +20,6 @@
#include <__type_traits/enable_if.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_execution_policy.h>
-#include <__type_traits/is_trivially_copyable.h>
#include <__type_traits/remove_cvref.h>
#include <__utility/move.h>
#include <optional>
@@ -95,10 +94,12 @@ template <class _ExecutionPolicy,
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_copy_n, _RawPolicy),
[&__policy](
_ForwardIterator __g_first, _Size __g_n, _ForwardOutIterator __g_result) -> optional<_ForwardIterator> {
- if constexpr (__has_random_access_iterator_category_or_concept<_ForwardIterator>::value)
+ if constexpr (__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
return std::__copy(__policy, std::move(__g_first), std::move(__g_first + __g_n), std::move(__g_result));
- else
+ } else {
+ (void)__policy;
return std::copy_n(__g_first, __g_n, __g_result);
+ }
},
std::move(__first),
std::move(__n),
diff --git a/libcxx/include/__algorithm/pstl_count.h b/libcxx/include/__algorithm/pstl_count.h
index 64c84d855e4f..65c96b2f06de 100644
--- a/libcxx/include/__algorithm/pstl_count.h
+++ b/libcxx/include/__algorithm/pstl_count.h
@@ -87,8 +87,8 @@ template <class _ExecutionPolicy,
class _Tp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__iter_diff_t<_ForwardIterator>>
-__count(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__iter_diff_t<_ForwardIterator>> __count(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, const _Tp& __value) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_count, _RawPolicy),
[&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value)
@@ -97,8 +97,8 @@ __count(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator
return __v == __g_value;
});
},
- std::move(__first),
- std::move(__last),
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
__value);
}
diff --git a/libcxx/include/__algorithm/pstl_equal.h b/libcxx/include/__algorithm/pstl_equal.h
index 0b38197d7f63..47333daaac88 100644
--- a/libcxx/include/__algorithm/pstl_equal.h
+++ b/libcxx/include/__algorithm/pstl_equal.h
@@ -91,7 +91,10 @@ _LIBCPP_HIDE_FROM_ABI bool
equal(_ExecutionPolicy&& __policy, _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2) {
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators");
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators");
- return std::equal(__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::equal_to{});
+ auto __res = std::__equal(__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::equal_to{});
+ if (!__res)
+ std::__throw_bad_alloc();
+ return *__res;
}
template <class _ExecutionPolicy,
@@ -171,8 +174,11 @@ equal(_ExecutionPolicy&& __policy,
_ForwardIterator2 __last2) {
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators");
_LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators");
- return std::equal(
+ auto __res = std::__equal(
__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), std::equal_to{});
+ if (!__res)
+ std::__throw_bad_alloc();
+ return *__res;
}
_LIBCPP_END_NAMESPACE_STD
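
With this hunk the policy overloads of std::equal go through the internal __equal, which signals backend failure with a disengaged optional; the frontend then converts that into std::bad_alloc (the pstl_sort.h hunk below applies the same pattern to std::sort). A caller-side sketch, illustrative only and assuming a toolchain with the parallel algorithms enabled; 'same' is a hypothetical function name:

    #include <algorithm>
    #include <execution>
    #include <new>
    #include <vector>

    bool same(const std::vector<int>& a, const std::vector<int>& b) {
      try {
        // If the parallel backend cannot acquire the resources it needs,
        // the policy overload now reports that as std::bad_alloc.
        return std::equal(std::execution::par, a.begin(), a.end(), b.begin());
      } catch (const std::bad_alloc&) {
        return std::equal(a.begin(), a.end(), b.begin());  // sequential fallback
      }
    }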
diff --git a/libcxx/include/__algorithm/pstl_fill.h b/libcxx/include/__algorithm/pstl_fill.h
index fd248506bc4b..1032d77af8a0 100644
--- a/libcxx/include/__algorithm/pstl_fill.h
+++ b/libcxx/include/__algorithm/pstl_fill.h
@@ -41,8 +41,8 @@ template <class _ExecutionPolicy,
class _Tp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-_LIBCPP_HIDE_FROM_ABI optional<__empty>
-__fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) noexcept {
+_LIBCPP_HIDE_FROM_ABI optional<__empty> __fill(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, const _Tp& __value) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill, _RawPolicy),
[&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) {
@@ -50,8 +50,8 @@ __fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator _
__element = __g_value;
});
},
- std::move(__first),
- std::move(__last),
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
__value);
}
diff --git a/libcxx/include/__algorithm/pstl_find.h b/libcxx/include/__algorithm/pstl_find.h
index b4c4dfb2ffb6..998db70fd644 100644
--- a/libcxx/include/__algorithm/pstl_find.h
+++ b/libcxx/include/__algorithm/pstl_find.h
@@ -65,8 +65,8 @@ template <class _ExecutionPolicy,
class _Predicate,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__remove_cvref_t<_ForwardIterator>>
-__find_if_not(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Predicate&& __pred) {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__remove_cvref_t<_ForwardIterator>> __find_if_not(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Predicate&& __pred) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_find_if_not, _RawPolicy),
[&](_ForwardIterator&& __g_first, _ForwardIterator&& __g_last, _Predicate&& __g_pred)
@@ -76,9 +76,9 @@ __find_if_not(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardI
return !__g_pred(__value);
});
},
- std::move(__first),
- std::move(__last),
- std::move(__pred));
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
+ std::forward<_Predicate>(__pred));
}
template <class _ExecutionPolicy,
@@ -103,8 +103,8 @@ template <class _ExecutionPolicy,
class _Tp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__remove_cvref_t<_ForwardIterator>>
-__find(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) noexcept {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__remove_cvref_t<_ForwardIterator>> __find(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, const _Tp& __value) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_find, _RawPolicy),
[&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) -> optional<_ForwardIterator> {
@@ -113,8 +113,8 @@ __find(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator _
return __element == __g_value;
});
},
- std::move(__first),
- std::move(__last),
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
__value);
}
diff --git a/libcxx/include/__algorithm/pstl_generate.h b/libcxx/include/__algorithm/pstl_generate.h
index 350c0e4798be..78e4dd81e644 100644
--- a/libcxx/include/__algorithm/pstl_generate.h
+++ b/libcxx/include/__algorithm/pstl_generate.h
@@ -40,8 +40,8 @@ template <class _ExecutionPolicy,
class _Generator,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__generate(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Generator&& __gen) {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty> __generate(
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Generator&& __gen) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate, _RawPolicy),
[&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Generator __g_gen) {
@@ -77,7 +77,7 @@ template <class _ExecutionPolicy,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
-__generate_n(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _Size&& __n, _Generator&& __gen) {
+__generate_n(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _Size&& __n, _Generator&& __gen) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate_n, _RawPolicy),
[&__policy](_ForwardIterator __g_first, _Size __g_n, _Generator __g_gen) {
diff --git a/libcxx/include/__algorithm/pstl_is_partitioned.h b/libcxx/include/__algorithm/pstl_is_partitioned.h
index 2dd5cf3ca2a2..068502e7ed11 100644
--- a/libcxx/include/__algorithm/pstl_is_partitioned.h
+++ b/libcxx/include/__algorithm/pstl_is_partitioned.h
@@ -41,7 +41,7 @@ template <class _ExecutionPolicy,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<bool> __is_partitioned(
- _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Predicate&& __pred) {
+ _ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Predicate&& __pred) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_is_partitioned, _RawPolicy),
[&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Predicate __g_pred) {
diff --git a/libcxx/include/__algorithm/pstl_merge.h b/libcxx/include/__algorithm/pstl_merge.h
index 87f634a67f58..f76a281f4c45 100644
--- a/libcxx/include/__algorithm/pstl_merge.h
+++ b/libcxx/include/__algorithm/pstl_merge.h
@@ -16,6 +16,7 @@
#include <__type_traits/enable_if.h>
#include <__type_traits/is_execution_policy.h>
#include <__type_traits/remove_cvref.h>
+#include <__utility/forward.h>
#include <__utility/move.h>
#include <optional>
@@ -34,26 +35,26 @@ template <class _ExecutionPolicy,
class _ForwardIterator1,
class _ForwardIterator2,
class _ForwardOutIterator,
- class _Comp = std::less<>,
+ class _Comp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator>
__merge(_ExecutionPolicy&&,
- _ForwardIterator1 __first1,
- _ForwardIterator1 __last1,
- _ForwardIterator2 __first2,
- _ForwardIterator2 __last2,
- _ForwardOutIterator __result,
- _Comp __comp = {}) noexcept {
+ _ForwardIterator1&& __first1,
+ _ForwardIterator1&& __last1,
+ _ForwardIterator2&& __first2,
+ _ForwardIterator2&& __last2,
+ _ForwardOutIterator&& __result,
+ _Comp&& __comp) noexcept {
using _Backend = typename __select_backend<_RawPolicy>::type;
return std::__pstl_merge<_RawPolicy>(
_Backend{},
- std::move(__first1),
- std::move(__last1),
- std::move(__first2),
- std::move(__last2),
- std::move(__result),
- std::move(__comp));
+ std::forward<_ForwardIterator1>(__first1),
+ std::forward<_ForwardIterator1>(__last1),
+ std::forward<_ForwardIterator2>(__first2),
+ std::forward<_ForwardIterator2>(__last2),
+ std::forward<_ForwardOutIterator>(__result),
+ std::forward<_Comp>(__comp));
}
template <class _ExecutionPolicy,
diff --git a/libcxx/include/__algorithm/pstl_move.h b/libcxx/include/__algorithm/pstl_move.h
index 3155ddedf91b..745fdefb9b11 100644
--- a/libcxx/include/__algorithm/pstl_move.h
+++ b/libcxx/include/__algorithm/pstl_move.h
@@ -20,7 +20,6 @@
#include <__type_traits/enable_if.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__type_traits/is_execution_policy.h>
-#include <__type_traits/is_trivially_copyable.h>
#include <__type_traits/remove_cvref.h>
#include <optional>
diff --git a/libcxx/include/__algorithm/pstl_replace.h b/libcxx/include/__algorithm/pstl_replace.h
index b2ded54dfe25..456df216b19d 100644
--- a/libcxx/include/__algorithm/pstl_replace.h
+++ b/libcxx/include/__algorithm/pstl_replace.h
@@ -91,8 +91,8 @@ template <class _ExecutionPolicy,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
__replace(_ExecutionPolicy&& __policy,
- _ForwardIterator __first,
- _ForwardIterator __last,
+ _ForwardIterator&& __first,
+ _ForwardIterator&& __last,
const _Tp& __old_value,
const _Tp& __new_value) noexcept {
return std::__pstl_frontend_dispatch(
@@ -106,8 +106,8 @@ __replace(_ExecutionPolicy&& __policy,
[&](__iter_reference<_ForwardIterator> __element) { return __element == __g_old_value; },
__g_new_value);
},
- std::move(__first),
- std::move(__last),
+ std::forward<_ForwardIterator>(__first),
+ std::forward<_ForwardIterator>(__last),
__old_value,
__new_value);
}
@@ -144,7 +144,7 @@ template <class _ExecutionPolicy,
_ForwardIterator&& __last,
_ForwardOutIterator&& __result,
_Pred&& __pred,
- const _Tp& __new_value) {
+ const _Tp& __new_value) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_replace_copy_if, _RawPolicy),
[&__policy](_ForwardIterator __g_first,
diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h
index 769dd81af77e..1b978b227276 100644
--- a/libcxx/include/__algorithm/pstl_sort.h
+++ b/libcxx/include/__algorithm/pstl_sort.h
@@ -41,17 +41,20 @@ template <class _ExecutionPolicy,
class _Comp,
class _RawPolicy = __remove_cvref_t<_ExecutionPolicy>,
enable_if_t<is_execution_policy_v<_RawPolicy>, int> = 0>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty> __sort(
- _ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) noexcept {
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty>
+__sort(_ExecutionPolicy&& __policy,
+ _RandomAccessIterator&& __first,
+ _RandomAccessIterator&& __last,
+ _Comp&& __comp) noexcept {
return std::__pstl_frontend_dispatch(
_LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_sort, _RawPolicy),
[&__policy](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) {
std::stable_sort(__policy, std::move(__g_first), std::move(__g_last), std::move(__g_comp));
return optional<__empty>{__empty{}};
},
- std::move(__first),
- std::move(__last),
- std::move(__comp));
+ std::forward<_RandomAccessIterator>(__first),
+ std::forward<_RandomAccessIterator>(__last),
+ std::forward<_Comp>(__comp));
}
template <class _ExecutionPolicy,
@@ -73,7 +76,8 @@ template <class _ExecutionPolicy,
_LIBCPP_HIDE_FROM_ABI void
sort(_ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last) {
_LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(_RandomAccessIterator, "sort requires RandomAccessIterators");
- std::sort(std::forward<_ExecutionPolicy>(__policy), std::move(__first), std::move(__last), less{});
+ if (!std::__sort(__policy, std::move(__first), std::move(__last), less{}))
+ std::__throw_bad_alloc();
}
_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__atomic/atomic_ref.h b/libcxx/include/__atomic/atomic_ref.h
new file mode 100644
index 000000000000..156f1961151c
--- /dev/null
+++ b/libcxx/include/__atomic/atomic_ref.h
@@ -0,0 +1,360 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ATOMIC_ATOMIC_REF_H
+#define _LIBCPP___ATOMIC_ATOMIC_REF_H
+
+#include <__assert>
+#include <__atomic/atomic_sync.h>
+#include <__atomic/check_memory_order.h>
+#include <__atomic/to_gcc_order.h>
+#include <__concepts/arithmetic.h>
+#include <__concepts/same_as.h>
+#include <__config>
+#include <__memory/addressof.h>
+#include <__type_traits/has_unique_object_representation.h>
+#include <__type_traits/is_trivially_copyable.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if _LIBCPP_STD_VER >= 20
+
+template <class _Tp>
+struct __atomic_ref_base {
+protected:
+ _Tp* __ptr_;
+
+ _LIBCPP_HIDE_FROM_ABI __atomic_ref_base(_Tp& __obj) : __ptr_(std::addressof(__obj)) {}
+
+private:
+ _LIBCPP_HIDE_FROM_ABI static _Tp* __clear_padding(_Tp& __val) noexcept {
+ _Tp* __ptr = std::addressof(__val);
+# if __has_builtin(__builtin_clear_padding)
+ __builtin_clear_padding(__ptr);
+# endif
+ return __ptr;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI static bool __compare_exchange(
+ _Tp* __ptr, _Tp* __expected, _Tp* __desired, bool __is_weak, int __success, int __failure) noexcept {
+ if constexpr (
+# if __has_builtin(__builtin_clear_padding)
+ has_unique_object_representations_v<_Tp> || floating_point<_Tp>
+# else
+ true // NOLINT(readability-simplify-boolean-expr)
+# endif
+ ) {
+ return __atomic_compare_exchange(__ptr, __expected, __desired, __is_weak, __success, __failure);
+ } else { // _Tp has padding bits and __builtin_clear_padding is available
+ __clear_padding(*__desired);
+ _Tp __copy = *__expected;
+ __clear_padding(__copy);
+ // The algorithm we use here is basically to perform `__atomic_compare_exchange` on the
+ // values until it has either succeeded, or failed because the value representation of the
+ // objects involved was different. This is why we loop around __atomic_compare_exchange:
+ // we basically loop until its failure is caused by the value representation of the objects
+ // being different, not only their object representation.
+ while (true) {
+ _Tp __prev = __copy;
+ if (__atomic_compare_exchange(__ptr, std::addressof(__copy), __desired, __is_weak, __success, __failure)) {
+ return true;
+ }
+ _Tp __curr = __copy;
+ if (std::memcmp(__clear_padding(__prev), __clear_padding(__curr), sizeof(_Tp)) != 0) {
+ // Value representations without padding bits do not compare equal ->
+ // write the current content of *ptr into *expected
+ std::memcpy(__expected, std::addressof(__copy), sizeof(_Tp));
+ return false;
+ }
+ }
+ }
+ }
+
+ friend struct __atomic_waitable_traits<__atomic_ref_base<_Tp>>;
+
+public:
+ using value_type = _Tp;
+
+ static constexpr size_t required_alignment = alignof(_Tp);
+
+ // The __atomic_always_lock_free builtin takes into account the alignment of the pointer if provided,
+ // so we create a fake pointer with a suitable alignment when querying it. Note that we are guaranteed
+ // that the pointer is going to be aligned properly at runtime because that is a (checked) precondition
+ // of atomic_ref's constructor.
+ static constexpr bool is_always_lock_free =
+ __atomic_always_lock_free(sizeof(_Tp), reinterpret_cast<void*>(-required_alignment));
+
+ _LIBCPP_HIDE_FROM_ABI bool is_lock_free() const noexcept { return __atomic_is_lock_free(sizeof(_Tp), __ptr_); }
+
+ _LIBCPP_HIDE_FROM_ABI void store(_Tp __desired, memory_order __order = memory_order::seq_cst) const noexcept
+ _LIBCPP_CHECK_STORE_MEMORY_ORDER(__order) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __order == memory_order::relaxed || __order == memory_order::release || __order == memory_order::seq_cst,
+ "atomic_ref: memory order argument to atomic store operation is invalid");
+ __atomic_store(__ptr_, __clear_padding(__desired), std::__to_gcc_order(__order));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __desired) const noexcept {
+ store(__desired);
+ return __desired;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp load(memory_order __order = memory_order::seq_cst) const noexcept
+ _LIBCPP_CHECK_LOAD_MEMORY_ORDER(__order) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __order == memory_order::relaxed || __order == memory_order::consume || __order == memory_order::acquire ||
+ __order == memory_order::seq_cst,
+ "atomic_ref: memory order argument to atomic load operation is invalid");
+ alignas(_Tp) byte __mem[sizeof(_Tp)];
+ auto* __ret = reinterpret_cast<_Tp*>(__mem);
+ __atomic_load(__ptr_, __ret, std::__to_gcc_order(__order));
+ return *__ret;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI operator _Tp() const noexcept { return load(); }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp exchange(_Tp __desired, memory_order __order = memory_order::seq_cst) const noexcept {
+ alignas(_Tp) byte __mem[sizeof(_Tp)];
+ auto* __ret = reinterpret_cast<_Tp*>(__mem);
+ __atomic_exchange(__ptr_, __clear_padding(__desired), __ret, std::__to_gcc_order(__order));
+ return *__ret;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI bool
+ compare_exchange_weak(_Tp& __expected, _Tp __desired, memory_order __success, memory_order __failure) const noexcept
+ _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__success, __failure) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __failure == memory_order::relaxed || __failure == memory_order::consume ||
+ __failure == memory_order::acquire || __failure == memory_order::seq_cst,
+ "atomic_ref: failure memory order argument to weak atomic compare-and-exchange operation is invalid");
+ return __compare_exchange(
+ __ptr_,
+ std::addressof(__expected),
+ std::addressof(__desired),
+ true,
+ std::__to_gcc_order(__success),
+ std::__to_gcc_order(__failure));
+ }
+ _LIBCPP_HIDE_FROM_ABI bool
+ compare_exchange_strong(_Tp& __expected, _Tp __desired, memory_order __success, memory_order __failure) const noexcept
+ _LIBCPP_CHECK_EXCHANGE_MEMORY_ORDER(__success, __failure) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __failure == memory_order::relaxed || __failure == memory_order::consume ||
+ __failure == memory_order::acquire || __failure == memory_order::seq_cst,
+ "atomic_ref: failure memory order argument to strong atomic compare-and-exchange operation is invalid");
+ return __compare_exchange(
+ __ptr_,
+ std::addressof(__expected),
+ std::addressof(__desired),
+ false,
+ std::__to_gcc_order(__success),
+ std::__to_gcc_order(__failure));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI bool
+ compare_exchange_weak(_Tp& __expected, _Tp __desired, memory_order __order = memory_order::seq_cst) const noexcept {
+ return __compare_exchange(
+ __ptr_,
+ std::addressof(__expected),
+ std::addressof(__desired),
+ true,
+ std::__to_gcc_order(__order),
+ std::__to_gcc_failure_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI bool
+ compare_exchange_strong(_Tp& __expected, _Tp __desired, memory_order __order = memory_order::seq_cst) const noexcept {
+ return __compare_exchange(
+ __ptr_,
+ std::addressof(__expected),
+ std::addressof(__desired),
+ false,
+ std::__to_gcc_order(__order),
+ std::__to_gcc_failure_order(__order));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI void wait(_Tp __old, memory_order __order = memory_order::seq_cst) const noexcept
+ _LIBCPP_CHECK_WAIT_MEMORY_ORDER(__order) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ __order == memory_order::relaxed || __order == memory_order::consume || __order == memory_order::acquire ||
+ __order == memory_order::seq_cst,
+ "atomic_ref: memory order argument to atomic wait operation is invalid");
+ std::__atomic_wait(*this, __old, __order);
+ }
+ _LIBCPP_HIDE_FROM_ABI void notify_one() const noexcept { std::__atomic_notify_one(*this); }
+ _LIBCPP_HIDE_FROM_ABI void notify_all() const noexcept { std::__atomic_notify_all(*this); }
+};
+
+template <class _Tp>
+struct __atomic_waitable_traits<__atomic_ref_base<_Tp>> {
+ static _LIBCPP_HIDE_FROM_ABI _Tp __atomic_load(const __atomic_ref_base<_Tp>& __a, memory_order __order) {
+ return __a.load(__order);
+ }
+ static _LIBCPP_HIDE_FROM_ABI const _Tp* __atomic_contention_address(const __atomic_ref_base<_Tp>& __a) {
+ return __a.__ptr_;
+ }
+};
+
+template <class _Tp>
+struct atomic_ref : public __atomic_ref_base<_Tp> {
+ static_assert(is_trivially_copyable_v<_Tp>, "std::atomic_ref<T> requires that 'T' be a trivially copyable type");
+
+ using __base = __atomic_ref_base<_Tp>;
+
+ _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp& __obj) : __base(__obj) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ reinterpret_cast<uintptr_t>(std::addressof(__obj)) % __base::required_alignment == 0,
+ "atomic_ref ctor: referenced object must be aligned to required_alignment");
+ }
+
+ _LIBCPP_HIDE_FROM_ABI atomic_ref(const atomic_ref&) noexcept = default;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __desired) const noexcept { return __base::operator=(__desired); }
+
+ atomic_ref& operator=(const atomic_ref&) = delete;
+};
+
+template <class _Tp>
+ requires(std::integral<_Tp> && !std::same_as<bool, _Tp>)
+struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
+ using __base = __atomic_ref_base<_Tp>;
+
+ using difference_type = __base::value_type;
+
+ _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp& __obj) : __base(__obj) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ reinterpret_cast<uintptr_t>(std::addressof(__obj)) % __base::required_alignment == 0,
+ "atomic_ref ctor: referenced object must be aligned to required_alignment");
+ }
+
+ _LIBCPP_HIDE_FROM_ABI atomic_ref(const atomic_ref&) noexcept = default;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __desired) const noexcept { return __base::operator=(__desired); }
+
+ atomic_ref& operator=(const atomic_ref&) = delete;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_add(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_sub(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_sub(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_and(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_and(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_or(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_or(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_xor(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_xor(this->__ptr_, __arg, std::__to_gcc_order(__order));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator++(int) const noexcept { return fetch_add(_Tp(1)); }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator--(int) const noexcept { return fetch_sub(_Tp(1)); }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator++() const noexcept { return fetch_add(_Tp(1)) + _Tp(1); }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator--() const noexcept { return fetch_sub(_Tp(1)) - _Tp(1); }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator+=(_Tp __arg) const noexcept { return fetch_add(__arg) + __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator-=(_Tp __arg) const noexcept { return fetch_sub(__arg) - __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator&=(_Tp __arg) const noexcept { return fetch_and(__arg) & __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator|=(_Tp __arg) const noexcept { return fetch_or(__arg) | __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator^=(_Tp __arg) const noexcept { return fetch_xor(__arg) ^ __arg; }
+};
+
+template <class _Tp>
+ requires std::floating_point<_Tp>
+struct atomic_ref<_Tp> : public __atomic_ref_base<_Tp> {
+ using __base = __atomic_ref_base<_Tp>;
+
+ using difference_type = __base::value_type;
+
+ _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp& __obj) : __base(__obj) {
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(
+ reinterpret_cast<uintptr_t>(std::addressof(__obj)) % __base::required_alignment == 0,
+ "atomic_ref ctor: referenced object must be aligned to required_alignment");
+ }
+
+ _LIBCPP_HIDE_FROM_ABI atomic_ref(const atomic_ref&) noexcept = default;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator=(_Tp __desired) const noexcept { return __base::operator=(__desired); }
+
+ atomic_ref& operator=(const atomic_ref&) = delete;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_add(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ _Tp __old = this->load(memory_order_relaxed);
+ _Tp __new = __old + __arg;
+ while (!this->compare_exchange_weak(__old, __new, __order, memory_order_relaxed)) {
+ __new = __old + __arg;
+ }
+ return __old;
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp fetch_sub(_Tp __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ _Tp __old = this->load(memory_order_relaxed);
+ _Tp __new = __old - __arg;
+ while (!this->compare_exchange_weak(__old, __new, __order, memory_order_relaxed)) {
+ __new = __old - __arg;
+ }
+ return __old;
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp operator+=(_Tp __arg) const noexcept { return fetch_add(__arg) + __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp operator-=(_Tp __arg) const noexcept { return fetch_sub(__arg) - __arg; }
+};
+
+template <class _Tp>
+struct atomic_ref<_Tp*> : public __atomic_ref_base<_Tp*> {
+ using __base = __atomic_ref_base<_Tp*>;
+
+ using difference_type = ptrdiff_t;
+
+ _LIBCPP_HIDE_FROM_ABI explicit atomic_ref(_Tp*& __ptr) : __base(__ptr) {}
+
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator=(_Tp* __desired) const noexcept { return __base::operator=(__desired); }
+
+ atomic_ref& operator=(const atomic_ref&) = delete;
+
+ _LIBCPP_HIDE_FROM_ABI _Tp* fetch_add(ptrdiff_t __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_add(this->__ptr_, __arg * sizeof(_Tp), std::__to_gcc_order(__order));
+ }
+ _LIBCPP_HIDE_FROM_ABI _Tp* fetch_sub(ptrdiff_t __arg, memory_order __order = memory_order_seq_cst) const noexcept {
+ return __atomic_fetch_sub(this->__ptr_, __arg * sizeof(_Tp), std::__to_gcc_order(__order));
+ }
+
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator++(int) const noexcept { return fetch_add(1); }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator--(int) const noexcept { return fetch_sub(1); }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator++() const noexcept { return fetch_add(1) + 1; }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator--() const noexcept { return fetch_sub(1) - 1; }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator+=(ptrdiff_t __arg) const noexcept { return fetch_add(__arg) + __arg; }
+ _LIBCPP_HIDE_FROM_ABI _Tp* operator-=(ptrdiff_t __arg) const noexcept { return fetch_sub(__arg) - __arg; }
+};
+
+_LIBCPP_CTAD_SUPPORTED_FOR_TYPE(atomic_ref);
+
+#endif // _LIBCPP_STD_VER >= 20
+
+_LIBCPP_END_NAMESPACE_STD
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP__ATOMIC_ATOMIC_REF_H
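
For context, a minimal C++20 usage sketch of the std::atomic_ref interface added above (illustrative only; 'counter' and 'work' are arbitrary names):

    #include <atomic>
    #include <cassert>
    #include <thread>

    int main() {
      // A plain int that is accessed only through atomic_ref while the
      // threads below are running.  required_alignment is a (checked)
      // precondition of the atomic_ref constructor, so satisfy it explicitly.
      alignas(std::atomic_ref<int>::required_alignment) int counter = 0;

      auto work = [&counter] {
        std::atomic_ref<int> ref(counter);
        for (int i = 0; i < 1000; ++i)
          ref.fetch_add(1, std::memory_order_relaxed);
      };

      std::thread t1(work), t2(work);
      t1.join();
      t2.join();

      assert(std::atomic_ref<int>(counter).load() == 2000);
      return 0;
    }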
diff --git a/libcxx/include/__atomic/atomic_sync.h b/libcxx/include/__atomic/atomic_sync.h
index e583dca38c4c..175700be54c0 100644
--- a/libcxx/include/__atomic/atomic_sync.h
+++ b/libcxx/include/__atomic/atomic_sync.h
@@ -12,6 +12,7 @@
#include <__atomic/contention_t.h>
#include <__atomic/cxx_atomic_impl.h>
#include <__atomic/memory_order.h>
+#include <__atomic/to_gcc_order.h>
#include <__availability>
#include <__chrono/duration.h>
#include <__config>
diff --git a/libcxx/include/__atomic/check_memory_order.h b/libcxx/include/__atomic/check_memory_order.h
index 3012aec0521b..536f764a6190 100644
--- a/libcxx/include/__atomic/check_memory_order.h
+++ b/libcxx/include/__atomic/check_memory_order.h
@@ -27,4 +27,8 @@
_LIBCPP_DIAGNOSE_WARNING(__f == memory_order_release || __f == memory_order_acq_rel, \
"memory order argument to atomic operation is invalid")
+#define _LIBCPP_CHECK_WAIT_MEMORY_ORDER(__m) \
+ _LIBCPP_DIAGNOSE_WARNING(__m == memory_order_release || __m == memory_order_acq_rel, \
+ "memory order argument to atomic operation is invalid")
+
#endif // _LIBCPP___ATOMIC_CHECK_MEMORY_ORDER_H
diff --git a/libcxx/include/__atomic/cxx_atomic_impl.h b/libcxx/include/__atomic/cxx_atomic_impl.h
index b900cc135f78..18e88aa97bec 100644
--- a/libcxx/include/__atomic/cxx_atomic_impl.h
+++ b/libcxx/include/__atomic/cxx_atomic_impl.h
@@ -10,6 +10,7 @@
#define _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H
#include <__atomic/memory_order.h>
+#include <__atomic/to_gcc_order.h>
#include <__config>
#include <__memory/addressof.h>
#include <__type_traits/is_assignable.h>
@@ -54,32 +55,6 @@ struct __cxx_atomic_base_impl {
_Tp __a_value;
};
-_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_order(memory_order __order) {
- // Avoid switch statement to make this a constexpr.
- return __order == memory_order_relaxed
- ? __ATOMIC_RELAXED
- : (__order == memory_order_acquire
- ? __ATOMIC_ACQUIRE
- : (__order == memory_order_release
- ? __ATOMIC_RELEASE
- : (__order == memory_order_seq_cst
- ? __ATOMIC_SEQ_CST
- : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME))));
-}
-
-_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_failure_order(memory_order __order) {
- // Avoid switch statement to make this a constexpr.
- return __order == memory_order_relaxed
- ? __ATOMIC_RELAXED
- : (__order == memory_order_acquire
- ? __ATOMIC_ACQUIRE
- : (__order == memory_order_release
- ? __ATOMIC_RELAXED
- : (__order == memory_order_seq_cst
- ? __ATOMIC_SEQ_CST
- : (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE : __ATOMIC_CONSUME))));
-}
-
template <typename _Tp>
_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(volatile __cxx_atomic_base_impl<_Tp>* __a, _Tp __val) {
__cxx_atomic_assign_volatile(__a->__a_value, __val);
diff --git a/libcxx/include/__atomic/to_gcc_order.h b/libcxx/include/__atomic/to_gcc_order.h
new file mode 100644
index 000000000000..d04c111addd3
--- /dev/null
+++ b/libcxx/include/__atomic/to_gcc_order.h
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ATOMIC_TO_GCC_ORDER_H
+#define _LIBCPP___ATOMIC_TO_GCC_ORDER_H
+
+#include <__atomic/memory_order.h>
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if defined(__ATOMIC_RELAXED) && defined(__ATOMIC_CONSUME) && defined(__ATOMIC_ACQUIRE) && \
+ defined(__ATOMIC_RELEASE) && defined(__ATOMIC_ACQ_REL) && defined(__ATOMIC_SEQ_CST)
+
+_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_order(memory_order __order) {
+ // Avoid switch statement to make this a constexpr.
+ return __order == memory_order_relaxed
+ ? __ATOMIC_RELAXED
+ : (__order == memory_order_acquire
+ ? __ATOMIC_ACQUIRE
+ : (__order == memory_order_release
+ ? __ATOMIC_RELEASE
+ : (__order == memory_order_seq_cst
+ ? __ATOMIC_SEQ_CST
+ : (__order == memory_order_acq_rel ? __ATOMIC_ACQ_REL : __ATOMIC_CONSUME))));
+}
+
+_LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR int __to_gcc_failure_order(memory_order __order) {
+ // Avoid switch statement to make this a constexpr.
+ return __order == memory_order_relaxed
+ ? __ATOMIC_RELAXED
+ : (__order == memory_order_acquire
+ ? __ATOMIC_ACQUIRE
+ : (__order == memory_order_release
+ ? __ATOMIC_RELAXED
+ : (__order == memory_order_seq_cst
+ ? __ATOMIC_SEQ_CST
+ : (__order == memory_order_acq_rel ? __ATOMIC_ACQUIRE : __ATOMIC_CONSUME))));
+}
+
+#endif
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___ATOMIC_TO_GCC_ORDER_H
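
__to_gcc_failure_order intentionally downgrades orders that are not valid as a compare-exchange failure order: memory_order_release maps to __ATOMIC_RELAXED and memory_order_acq_rel to __ATOMIC_ACQUIRE. The same mapping written as a switch, purely as a readability aid (hypothetical code assuming the GCC __ATOMIC_* macros the header itself guards on; the real function keeps the conditional chain so it stays constexpr even in C++11):

    #include <atomic>

    int to_gcc_failure_order_sketch(std::memory_order order) {
      switch (order) {
      case std::memory_order_relaxed: return __ATOMIC_RELAXED;
      case std::memory_order_consume: return __ATOMIC_CONSUME;
      case std::memory_order_acquire: return __ATOMIC_ACQUIRE;
      case std::memory_order_release: return __ATOMIC_RELAXED;  // no release semantics on failure
      case std::memory_order_acq_rel: return __ATOMIC_ACQUIRE;  // keep only the acquire half
      case std::memory_order_seq_cst: return __ATOMIC_SEQ_CST;
      }
      return __ATOMIC_SEQ_CST;  // unreachable for valid inputs
    }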
diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h
index c9027de9238c..868fd7c01533 100644
--- a/libcxx/include/__exception/exception_ptr.h
+++ b/libcxx/include/__exception/exception_ptr.h
@@ -38,11 +38,14 @@ struct __cxa_exception;
_LIBCPP_OVERRIDABLE_FUNC_VIS __cxa_exception* __cxa_init_primary_exception(
void*,
std::type_info*,
- void(
# if defined(_WIN32)
- __thiscall
+ void(__thiscall*)(void*)) throw();
+# elif defined(__wasm__)
+ // In Wasm, a destructor returns its argument
+ void* (*)(void*)) throw();
+# else
+ void (*)(void*)) throw();
# endif
- *)(void*)) throw();
}
} // namespace __cxxabiv1
@@ -92,8 +95,16 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT {
using _Ep2 = __decay_t<_Ep>;
void* __ex = __cxxabiv1::__cxa_allocate_exception(sizeof(_Ep));
+# ifdef __wasm__
+ // In Wasm, a destructor returns its argument
+ (void)__cxxabiv1::__cxa_init_primary_exception(__ex, const_cast<std::type_info*>(&typeid(_Ep)), [](void* __p) -> void* {
+# else
(void)__cxxabiv1::__cxa_init_primary_exception(__ex, const_cast<std::type_info*>(&typeid(_Ep)), [](void* __p) {
+# endif
std::__destroy_at(static_cast<_Ep2*>(__p));
+# ifdef __wasm__
+ return __p;
+# endif
});
try {
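
The preprocessor split above exists because the WebAssembly C++ ABI has an exception's destructor callback return its argument, whereas the callback returns void elsewhere. The two callback shapes as a stand-alone illustration (hypothetical type and variable names, not libc++ identifiers):

    #include <cstdio>

    using cleanup_default_t = void (*)(void*);   // non-Wasm: returns nothing
    using cleanup_wasm_t    = void* (*)(void*);  // Wasm: returns its argument

    int main() {
      cleanup_default_t d = [](void* p) { std::printf("destroy %p\n", p); };
      cleanup_wasm_t w = [](void* p) -> void* {
        std::printf("destroy %p\n", p);
        return p;
      };
      int x = 0;
      d(&x);
      (void)w(&x);
      return 0;
    }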
diff --git a/libcxx/include/__locale b/libcxx/include/__locale
index 36ac099d650e..1e97c7594c8b 100644
--- a/libcxx/include/__locale
+++ b/libcxx/include/__locale
@@ -343,12 +343,12 @@ public:
static const mask __regex_word = 0x4000; // 0x8000 and 0x0100 and 0x00ff are used
# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT
# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA
-#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
+#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__)
# ifdef __APPLE__
typedef __uint32_t mask;
# elif defined(__FreeBSD__)
typedef unsigned long mask;
-# elif defined(__EMSCRIPTEN__) || defined(__NetBSD__)
+# elif defined(__NetBSD__)
typedef unsigned short mask;
# endif
static const mask space = _CTYPE_S;
diff --git a/libcxx/include/atomic b/libcxx/include/atomic
index cb142b09bff3..80a0f9ee373e 100644
--- a/libcxx/include/atomic
+++ b/libcxx/include/atomic
@@ -599,6 +599,7 @@ template <class T>
#include <__atomic/atomic_flag.h>
#include <__atomic/atomic_init.h>
#include <__atomic/atomic_lock_free.h>
+#include <__atomic/atomic_ref.h>
#include <__atomic/atomic_sync.h>
#include <__atomic/check_memory_order.h>
#include <__atomic/contention_t.h>
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index aff2cd11cfcf..1add4653209a 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -62,6 +62,11 @@ struct __simd_operations<_Tp, simd_abi::__scalar> {
static _LIBCPP_HIDE_FROM_ABI void __load(_SimdStorage& __s, const _Up* __mem) noexcept {
__s.__data = static_cast<_Tp>(__mem[0]);
}
+
+ template <class _Up>
+ static _LIBCPP_HIDE_FROM_ABI void __store(_SimdStorage __s, _Up* __mem) noexcept {
+ *__mem = static_cast<_Up>(__s.__data);
+ }
};
template <class _Tp>
@@ -71,6 +76,8 @@ struct __mask_operations<_Tp, simd_abi::__scalar> {
static _LIBCPP_HIDE_FROM_ABI _MaskStorage __broadcast(bool __v) noexcept { return {__v}; }
static _LIBCPP_HIDE_FROM_ABI void __load(_MaskStorage& __s, const bool* __mem) noexcept { __s.__data = __mem[0]; }
+
+ static _LIBCPP_HIDE_FROM_ABI void __store(_MaskStorage __s, bool* __mem) noexcept { __mem[0] = __s.__data; }
};
} // namespace parallelism_v2
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index db4ebb8e4a38..37e334aad6da 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -70,6 +70,17 @@ public:
_Impl::__load(__s_, _Flags::template __apply<simd>(__mem));
}
+ // copy functions
+ template <class _Up, class _Flags, enable_if_t<__is_vectorizable_v<_Up> && is_simd_flag_type_v<_Flags>, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI void copy_from(const _Up* __mem, _Flags) {
+ _Impl::__load(__s_, _Flags::template __apply<simd>(__mem));
+ }
+
+ template <class _Up, class _Flags, enable_if_t<__is_vectorizable_v<_Up> && is_simd_flag_type_v<_Flags>, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI void copy_to(_Up* __mem, _Flags) const {
+ _Impl::__store(__s_, _Flags::template __apply<simd>(__mem));
+ }
+
// scalar access [simd.subscr]
_LIBCPP_HIDE_FROM_ABI reference operator[](size_t __i) noexcept { return reference(__s_, __i); }
_LIBCPP_HIDE_FROM_ABI value_type operator[](size_t __i) const noexcept { return __s_.__get(__i); }
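
copy_from and copy_to are the Parallelism TS v2 load/store interface, backed by the __store hooks added to the backends in this patch. A minimal usage sketch (illustrative only; 'double_all' is an arbitrary name, and it assumes <experimental/simd> is complete enough on the target, which libc++'s implementation may not yet be):

    #include <cstddef>
    #include <experimental/simd>

    namespace stdx = std::experimental;

    void double_all(float* data, std::size_t n) {
      using V = stdx::native_simd<float>;
      std::size_t i = 0;
      for (; i + V::size() <= n; i += V::size()) {
        V v;
        v.copy_from(data + i, stdx::element_aligned);  // vector load
        for (std::size_t j = 0; j < V::size(); ++j)
          v[j] = v[j] * 2.0f;                          // element-wise via operator[]
        v.copy_to(data + i, stdx::element_aligned);    // vector store
      }
      for (; i < n; ++i)                               // scalar tail
        data[i] *= 2.0f;
    }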
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index 754db7992683..fd6dee2e28ee 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -58,6 +58,17 @@ public:
_Impl::__load(__s_, _Flags::template __apply<simd_mask>(__mem));
}
+ // copy functions
+ template <class _Flags, enable_if_t<is_simd_flag_type_v<_Flags>, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI void copy_from(const value_type* __mem, _Flags) {
+ _Impl::__load(__s_, _Flags::template __apply<simd_mask>(__mem));
+ }
+
+ template <class _Flags, enable_if_t<is_simd_flag_type_v<_Flags>, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI void copy_to(value_type* __mem, _Flags) const {
+ _Impl::__store(__s_, _Flags::template __apply<simd_mask>(__mem));
+ }
+
// scalar access [simd.mask.subscr]
_LIBCPP_HIDE_FROM_ABI reference operator[](size_t __i) noexcept { return reference(__s_, __i); }
_LIBCPP_HIDE_FROM_ABI value_type operator[](size_t __i) const noexcept { return __s_.__get(__i); }
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index c9423df93cfa..316866b84873 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -80,6 +80,12 @@ struct __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> {
for (size_t __i = 0; __i < _Np; __i++)
__s.__data[__i] = static_cast<_Tp>(__mem[__i]);
}
+
+ template <class _Up>
+ static _LIBCPP_HIDE_FROM_ABI void __store(_SimdStorage __s, _Up* __mem) noexcept {
+ for (size_t __i = 0; __i < _Np; __i++)
+ __mem[__i] = static_cast<_Up>(__s.__data[__i]);
+ }
};
template <class _Tp, int _Np>
@@ -99,6 +105,11 @@ struct __mask_operations<_Tp, simd_abi::__vec_ext<_Np>> {
for (size_t __i = 0; __i < _Np; __i++)
__s.__data[__i] = experimental::__set_all_bits<_Tp>(__mem[__i]);
}
+
+ static _LIBCPP_HIDE_FROM_ABI void __store(_MaskStorage __s, bool* __mem) noexcept {
+ for (size_t __i = 0; __i < _Np; __i++)
+ __mem[__i] = static_cast<bool>(__s.__data[__i]);
+ }
};
} // namespace parallelism_v2
diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list
index 5a7521eed410..80dd49fe3d75 100644
--- a/libcxx/include/forward_list
+++ b/libcxx/include/forward_list
@@ -554,7 +554,6 @@ protected:
return __guard.__release_ptr();
}
- template <class... _Args>
_LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
// For the same reason as above, we use the allocator's destroy() method for the value_type,
// but not for the node itself.
diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp
new file mode 100644
index 000000000000..f6aa1ea6b62b
--- /dev/null
+++ b/libcxx/include/libcxx.imp
@@ -0,0 +1,869 @@
+[
+ { include: [ "<__algorithm/adjacent_find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/all_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/any_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/binary_search.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/clamp.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/comp.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/comp_ref_type.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy_backward.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy_move_common.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/copy_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/count.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/count_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/equal.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/equal_range.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/fill.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/fill_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_end.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_first_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_if_not.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/find_segment_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/fold.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/for_each.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/for_each_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/for_each_segment.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/generate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/generate_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/half_positive.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_found_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_fun_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_in_out_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_in_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_out_out_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/in_out_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/includes.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/inplace_merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_heap_until.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_partitioned.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_sorted.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/is_sorted_until.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/iter_swap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/iterator_operations.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/lexicographical_compare.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/lexicographical_compare_three_way.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/lower_bound.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/make_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/make_projected.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/max.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/max_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/min.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/min_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/min_max_result.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/minmax.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/minmax_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/mismatch.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/move.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/move_backward.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/next_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/none_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/nth_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partial_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partial_sort_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partition.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partition_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/partition_point.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pop_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/prev_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_any_all_none_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backend.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backend.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/any_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/backend.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/fill.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/find_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/for_each.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/libdispatch.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/serial.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/stable_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/thread.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/transform.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_backends/cpu_backends/transform_reduce.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_count.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_equal.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_fill.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_for_each.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_frontend_dispatch.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_generate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_is_partitioned.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_move.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_replace.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_rotate_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_stable_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/pstl_transform.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/push_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_adjacent_find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_all_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_any_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_binary_search.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_clamp.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_contains.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_contains_subrange.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_copy_backward.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_copy_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_count.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_count_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_ends_with.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_equal.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_equal_range.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_fill.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_fill_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find_end.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find_first_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_find_if_not.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_for_each.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_for_each_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_generate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_generate_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_includes.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_inplace_merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_heap_until.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_partitioned.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_sorted.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_is_sorted_until.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_iterator_concept.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_lexicographical_compare.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_lower_bound.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_make_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_max.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_max_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_merge.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_min.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_min_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_minmax.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_minmax_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_mismatch.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_move.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_move_backward.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_next_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_none_of.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_nth_element.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partial_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partial_sort_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partition.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partition_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_partition_point.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_pop_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_prev_permutation.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_push_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_remove.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_remove_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_remove_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_remove_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_replace.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_replace_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_replace_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_replace_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_reverse.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_reverse_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_rotate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_rotate_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_sample.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_search.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_search_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_set_difference.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_set_intersection.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_set_symmetric_difference.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_set_union.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_shuffle.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_sort_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_stable_partition.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_stable_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_starts_with.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_swap_ranges.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_transform.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_unique.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_unique_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/ranges_upper_bound.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/remove.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/remove_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/remove_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/remove_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/replace.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/replace_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/replace_copy_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/replace_if.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/reverse.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/reverse_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/rotate.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/rotate_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/sample.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/search.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/search_n.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/set_difference.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/set_intersection.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/set_symmetric_difference.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/set_union.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/shift_left.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/shift_right.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/shuffle.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/sift_down.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/simd_utils.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/sort_heap.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/stable_partition.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/stable_sort.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/swap_ranges.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/three_way_comp_ref_type.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/transform.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/uniform_random_bit_generator_adaptor.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/unique.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/unique_copy.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/unwrap_iter.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/unwrap_range.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__algorithm/upper_bound.h>", "private", "<algorithm>", "public" ] },
+ { include: [ "<__atomic/aliases.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_base.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_flag.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_init.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_lock_free.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_ref.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/atomic_sync.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/check_memory_order.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/contention_t.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/cxx_atomic_impl.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/fence.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/is_always_lock_free.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/kill_dependency.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/memory_order.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__atomic/to_gcc_order.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__bit/bit_cast.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/bit_ceil.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/bit_floor.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/bit_log2.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/bit_width.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/blsr.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/byteswap.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/countl.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/countr.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/endian.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/has_single_bit.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/invert_if.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/popcount.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__bit/rotate.h>", "private", "<bit>", "public" ] },
+ { include: [ "<__charconv/chars_format.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/from_chars_integral.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/from_chars_result.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/tables.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars_base_10.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars_floating_point.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars_integral.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/to_chars_result.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__charconv/traits.h>", "private", "<charconv>", "public" ] },
+ { include: [ "<__chrono/calendar.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/concepts.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/convert_to_timespec.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/convert_to_tm.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/day.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/duration.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/file_clock.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/formatter.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/hh_mm_ss.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/high_resolution_clock.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/leap_second.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/literals.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/month.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/month_weekday.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/monthday.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/ostream.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/parser_std_format_spec.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/statically_widen.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/steady_clock.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/sys_info.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/system_clock.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/time_point.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/time_zone.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/time_zone_link.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/tzdb.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/tzdb_list.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/weekday.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/year.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/year_month.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/year_month_day.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__chrono/year_month_weekday.h>", "private", "<chrono>", "public" ] },
+ { include: [ "<__compare/common_comparison_category.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_partial_order_fallback.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_strong_order_fallback.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_three_way.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_three_way_result.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/compare_weak_order_fallback.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/is_eq.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/ordering.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/partial_order.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/strong_order.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/synth_three_way.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/three_way_comparable.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__compare/weak_order.h>", "private", "<compare>", "public" ] },
+ { include: [ "<__concepts/arithmetic.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/assignable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/boolean_testable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/class_or_enum.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/common_reference_with.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/common_with.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/constructible.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/convertible_to.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/copyable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/derived_from.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/destructible.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/different_from.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/equality_comparable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/invocable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/movable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/predicate.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/regular.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/relation.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/same_as.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/semiregular.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/swappable.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__concepts/totally_ordered.h>", "private", "<concepts>", "public" ] },
+ { include: [ "<__condition_variable/condition_variable.h>", "private", "<condition_variable>", "public" ] },
+ { include: [ "<__coroutine/coroutine_handle.h>", "private", "<coroutine>", "public" ] },
+ { include: [ "<__coroutine/coroutine_traits.h>", "private", "<coroutine>", "public" ] },
+ { include: [ "<__coroutine/noop_coroutine_handle.h>", "private", "<coroutine>", "public" ] },
+ { include: [ "<__coroutine/trivial_awaitables.h>", "private", "<coroutine>", "public" ] },
+ { include: [ "<__exception/exception.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__exception/exception_ptr.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__exception/nested_exception.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__exception/operations.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__exception/terminate.h>", "private", "<exception>", "public" ] },
+ { include: [ "<__expected/bad_expected_access.h>", "private", "<expected>", "public" ] },
+ { include: [ "<__expected/expected.h>", "private", "<expected>", "public" ] },
+ { include: [ "<__expected/unexpect.h>", "private", "<expected>", "public" ] },
+ { include: [ "<__expected/unexpected.h>", "private", "<expected>", "public" ] },
+ { include: [ "<__filesystem/copy_options.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/directory_entry.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/directory_iterator.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/directory_options.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/file_status.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/file_time_type.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/file_type.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/filesystem_error.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/operations.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/path.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/path_iterator.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/perm_options.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/perms.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/recursive_directory_iterator.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/space_info.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__filesystem/u8path.h>", "private", "<filesystem>", "public" ] },
+ { include: [ "<__format/buffer.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/concepts.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/container_adaptor.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/enable_insertable.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/escaped_output_table.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/extended_grapheme_cluster_table.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_arg.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_arg_store.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_args.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_context.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_error.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_functions.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_parse_context.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_string.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/format_to_n_result.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_bool.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_char.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_floating_point.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_integer.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_integral.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_output.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_pointer.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_string.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/formatter_tuple.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/indic_conjunct_break_table.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/parser_std_format_spec.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/range_default_formatter.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/range_formatter.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/unicode.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/width_estimation_table.h>", "private", "<format>", "public" ] },
+ { include: [ "<__format/write_escaped.h>", "private", "<format>", "public" ] },
+ { include: [ "<__functional/binary_function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/binary_negate.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/bind.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/bind_back.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/bind_front.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/binder1st.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/binder2nd.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/boyer_moore_searcher.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/compose.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/default_searcher.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/hash.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/identity.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/invoke.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/is_transparent.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/mem_fn.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/mem_fun_ref.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/not_fn.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/operations.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/perfect_forward.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/pointer_to_binary_function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/pointer_to_unary_function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/ranges_operations.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/reference_wrapper.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/unary_function.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/unary_negate.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__functional/weak_result_type.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__fwd/array.h>", "private", "<array>", "public" ] },
+ { include: [ "<__fwd/bit_reference.h>", "private", "<bitset>", "public" ] },
+ { include: [ "<__fwd/bit_reference.h>", "private", "<vector>", "public" ] },
+ { include: [ "<__fwd/complex.h>", "private", "<complex>", "public" ] },
+ { include: [ "<__fwd/deque.h>", "private", "<deque>", "public" ] },
+ { include: [ "<__fwd/format.h>", "private", "<format>", "public" ] },
+ { include: [ "<__fwd/fstream.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/functional.h>", "private", "<functional>", "public" ] },
+ { include: [ "<__fwd/ios.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/istream.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/mdspan.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__fwd/memory.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__fwd/memory_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__fwd/ostream.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/pair.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__fwd/queue.h>", "private", "<queue>", "public" ] },
+ { include: [ "<__fwd/span.h>", "private", "<span>", "public" ] },
+ { include: [ "<__fwd/sstream.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/stack.h>", "private", "<stack>", "public" ] },
+ { include: [ "<__fwd/streambuf.h>", "private", "<iosfwd>", "public" ] },
+ { include: [ "<__fwd/string.h>", "private", "<string>", "public" ] },
+ { include: [ "<__fwd/string_view.h>", "private", "<string_view>", "public" ] },
+ { include: [ "<__fwd/subrange.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__fwd/tuple.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__fwd/vector.h>", "private", "<vector>", "public" ] },
+ { include: [ "<__ios/fpos.h>", "private", "<ios>", "public" ] },
+ { include: [ "<__iterator/access.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/advance.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/back_insert_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/bounded_iter.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/common_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/concepts.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/counted_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/cpp17_iterator_concepts.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/data.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/default_sentinel.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/distance.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/empty.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/erase_if_container.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/front_insert_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/incrementable_traits.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/indirectly_comparable.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/insert_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/istream_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/istreambuf_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iter_move.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iter_swap.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iterator_traits.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/iterator_with_data.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/mergeable.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/move_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/move_sentinel.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/next.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/ostream_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/ostreambuf_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/permutable.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/prev.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/projected.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/ranges_iterator_traits.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/readable_traits.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/reverse_access.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/reverse_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/segmented_iterator.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/size.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/sortable.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/unreachable_sentinel.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__iterator/wrap_iter.h>", "private", "<iterator>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/android.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/bsd_locale_defaults.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/bsd_locale_fallbacks.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/fuchsia.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/ibm.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/locale_guard.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/musl.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/newlib.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/openbsd.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__locale_dir/locale_base_api/win32.h>", "private", "<locale>", "public" ] },
+ { include: [ "<__math/abs.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/copysign.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/error_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/exponential_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/fdim.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/fma.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/gamma.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/hyperbolic_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/hypot.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/inverse_hyperbolic_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/inverse_trigonometric_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/logarithms.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/min_max.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/modulo.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/remainder.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/roots.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/rounding_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/traits.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__math/trigonometric_functions.h>", "private", "<cmath>", "public" ] },
+ { include: [ "<__mdspan/default_accessor.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/extents.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/layout_left.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/layout_right.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/layout_stride.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__mdspan/mdspan.h>", "private", "<mdspan>", "public" ] },
+ { include: [ "<__memory/addressof.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/align.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/aligned_alloc.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocate_at_least.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocation_guard.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocator_arg_t.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocator_destructor.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/allocator_traits.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/assume_aligned.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/auto_ptr.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/builtin_new_allocator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/compressed_pair.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/concepts.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/construct_at.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/destruct_n.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/pointer_traits.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/ranges_construct_at.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/ranges_uninitialized_algorithms.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/raw_storage_iterator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/shared_ptr.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/swap_allocator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/temp_value.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/temporary_buffer.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/uninitialized_algorithms.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/unique_ptr.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/uses_allocator.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/uses_allocator_construction.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory/voidify.h>", "private", "<memory>", "public" ] },
+ { include: [ "<__memory_resource/memory_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/monotonic_buffer_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/polymorphic_allocator.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/pool_options.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/synchronized_pool_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__memory_resource/unsynchronized_pool_resource.h>", "private", "<memory_resource>", "public" ] },
+ { include: [ "<__mutex/lock_guard.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__mutex/mutex.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__mutex/once_flag.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__mutex/tag_types.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__mutex/unique_lock.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__numeric/accumulate.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/adjacent_difference.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/exclusive_scan.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/gcd_lcm.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/inclusive_scan.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/inner_product.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/iota.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/midpoint.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/partial_sum.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/pstl_reduce.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/pstl_transform_reduce.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/reduce.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/saturation_arithmetic.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/transform_exclusive_scan.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/transform_inclusive_scan.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__numeric/transform_reduce.h>", "private", "<numeric>", "public" ] },
+ { include: [ "<__random/bernoulli_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/binomial_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/cauchy_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/chi_squared_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/clamp_to_integral.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/default_random_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/discard_block_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/discrete_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/exponential_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/extreme_value_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/fisher_f_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/gamma_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/generate_canonical.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/geometric_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/independent_bits_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/is_seed_sequence.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/is_valid.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/knuth_b.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/linear_congruential_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/log2.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/lognormal_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/mersenne_twister_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/negative_binomial_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/normal_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/piecewise_constant_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/piecewise_linear_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/poisson_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/random_device.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/ranlux.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/seed_seq.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/shuffle_order_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/student_t_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/subtract_with_carry_engine.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/uniform_int_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/uniform_random_bit_generator.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/uniform_real_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__random/weibull_distribution.h>", "private", "<random>", "public" ] },
+ { include: [ "<__ranges/access.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/all.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/as_rvalue_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/chunk_by_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/common_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/concepts.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/container_compatible_range.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/counted.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/dangling.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/data.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/drop_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/drop_while_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/elements_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/empty.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/empty_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/enable_borrowed_range.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/enable_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/filter_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/from_range.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/iota_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/istream_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/join_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/lazy_split_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/movable_box.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/non_propagating_cache.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/owning_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/range_adaptor.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/rbegin.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/ref_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/rend.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/repeat_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/reverse_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/single_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/size.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/split_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/subrange.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/take_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/take_while_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/to.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/transform_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/view_interface.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/views.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__ranges/zip_view.h>", "private", "<ranges>", "public" ] },
+ { include: [ "<__stop_token/atomic_unique_lock.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/intrusive_list_view.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/intrusive_shared_ptr.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/stop_callback.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/stop_source.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/stop_state.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__stop_token/stop_token.h>", "private", "<stop_token>", "public" ] },
+ { include: [ "<__string/char_traits.h>", "private", "<string>", "public" ] },
+ { include: [ "<__string/constexpr_c_functions.h>", "private", "<string>", "public" ] },
+ { include: [ "<__string/extern_template_lists.h>", "private", "<string>", "public" ] },
+ { include: [ "<__system_error/errc.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__system_error/error_category.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__system_error/error_code.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__system_error/error_condition.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__system_error/system_error.h>", "private", "<system_error>", "public" ] },
+ { include: [ "<__thread/formatter.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/id.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/jthread.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/poll_with_backoff.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support/c11.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support/c11.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support/c11.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support/c11.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support/external.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support/external.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support/external.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support/external.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support/pthread.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support/pthread.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support/pthread.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support/pthread.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/support/windows.h>", "private", "<atomic>", "public" ] },
+ { include: [ "<__thread/support/windows.h>", "private", "<mutex>", "public" ] },
+ { include: [ "<__thread/support/windows.h>", "private", "<semaphore>", "public" ] },
+ { include: [ "<__thread/support/windows.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/this_thread.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/thread.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__thread/timed_backoff_policy.h>", "private", "<thread>", "public" ] },
+ { include: [ "<__tuple/find_index.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/make_tuple_types.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/sfinae_helpers.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_element.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_indices.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_like.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_like_ext.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_like_no_subrange.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_size.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__tuple/tuple_types.h>", "private", "<tuple>", "public" ] },
+ { include: [ "<__type_traits/add_const.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_cv.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_lvalue_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_rvalue_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/add_volatile.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/aligned_storage.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/aligned_union.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/alignment_of.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/apply_cv.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/can_extract_key.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/common_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/common_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/conditional.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/conjunction.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/copy_cv.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/copy_cvref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/datasizeof.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/decay.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/dependent_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/desugars_to.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/disjunction.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/enable_if.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/extent.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/has_unique_object_representation.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/has_virtual_destructor.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/integral_constant.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/invoke.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_abstract.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_aggregate.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_allocator.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_always_bitcastable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_arithmetic.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_array.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_assignable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_base_of.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_bounded_array.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_callable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_char_like_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_class.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_compound.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_const.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_constant_evaluated.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_constructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_convertible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_core_convertible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_destructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_empty.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_enum.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_equality_comparable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_execution_policy.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_final.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_floating_point.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_function.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_fundamental.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_implicitly_default_constructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_integral.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_literal_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_member_function_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_member_object_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_member_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_nothrow_assignable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_nothrow_constructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_nothrow_convertible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_nothrow_destructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_null_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_object.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_pod.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_polymorphic.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_primary_template.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_reference_wrapper.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_referenceable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_same.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_scalar.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_scoped_enum.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_signed.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_signed_integer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_specialization.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_standard_layout.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_swappable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivial.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_assignable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_constructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_copyable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_destructible.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_lexicographically_comparable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_trivially_relocatable.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_unbounded_array.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_union.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_unsigned.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_unsigned_integer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_valid_expansion.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_void.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/is_volatile.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/lazy.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/make_32_64_or_128_bit.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/make_const_lvalue_ref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/make_signed.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/make_unsigned.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/maybe_const.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/nat.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/negation.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/noexcept_move_assign_container.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/promote.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/rank.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_all_extents.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_const.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_const_ref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_cv.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_cvref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_extent.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_pointer.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_reference.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/remove_volatile.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/result_of.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/strip_signature.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/type_identity.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/type_list.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/underlying_type.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/unwrap_ref.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__type_traits/void_t.h>", "private", "<type_traits>", "public" ] },
+ { include: [ "<__utility/as_const.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/as_lvalue.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/auto_cast.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/cmp.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/convert_to_integral.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/declval.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/empty.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/exception_guard.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/exchange.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/forward.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/forward_like.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/in_place.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/integer_sequence.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/is_pointer_in_range.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/move.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/no_destroy.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/pair.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/piecewise_construct.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/priority_tag.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/rel_ops.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/small_buffer.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/swap.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/to_underlying.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__utility/unreachable.h>", "private", "<utility>", "public" ] },
+ { include: [ "<__variant/monostate.h>", "private", "<variant>", "public" ] },
+]
diff --git a/libcxx/include/list b/libcxx/include/list
index 90bddcc29db0..610a24e38460 100644
--- a/libcxx/include/list
+++ b/libcxx/include/list
@@ -567,7 +567,6 @@ protected:
return __guard.__release_ptr();
}
- template <class... _Args>
_LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) {
// For the same reason as above, we use the allocator's destroy() method for the value_type,
// but not for the node itself.
diff --git a/libcxx/include/locale b/libcxx/include/locale
index 748b276a8525..041d7bcd27fc 100644
--- a/libcxx/include/locale
+++ b/libcxx/include/locale
@@ -368,7 +368,11 @@ struct _LIBCPP_EXPORTED_FROM_ABI __num_get_base {
static const int __num_get_buf_sz = 40;
static int __get_base(ios_base&);
- static const char __src[33];
+ static const char __src[33]; // "0123456789abcdefABCDEFxX+-pPiInN"
+ // count of leading characters in __src used for parsing integers ("012..X+-")
+ static const size_t __int_chr_cnt = 26;
+ // count of leading characters in __src used for parsing floating-point values ("012..-pP")
+ static const size_t __fp_chr_cnt = 28;
};
_LIBCPP_EXPORTED_FROM_ABI void
@@ -431,7 +435,7 @@ private:
template <typename _Tp>
const _Tp* __do_widen_p(ios_base& __iob, _Tp* __atoms) const {
locale __loc = __iob.getloc();
- use_facet<ctype<_Tp> >(__loc).widen(__src, __src + 26, __atoms);
+ use_facet<ctype<_Tp> >(__loc).widen(__src, __src + __int_chr_cnt, __atoms);
return __atoms;
}
@@ -447,7 +451,7 @@ private:
template <class _CharT>
string __num_get<_CharT>::__stage2_int_prep(ios_base& __iob, _CharT* __atoms, _CharT& __thousands_sep) {
locale __loc = __iob.getloc();
- std::use_facet<ctype<_CharT> >(__loc).widen(__src, __src + 26, __atoms);
+ std::use_facet<ctype<_CharT> >(__loc).widen(__src, __src + __int_chr_cnt, __atoms);
const numpunct<_CharT>& __np = std::use_facet<numpunct<_CharT> >(__loc);
__thousands_sep = __np.thousands_sep();
return __np.grouping();
@@ -458,7 +462,7 @@ template <class _CharT>
string __num_get<_CharT>::__stage2_float_prep(
ios_base& __iob, _CharT* __atoms, _CharT& __decimal_point, _CharT& __thousands_sep) {
locale __loc = __iob.getloc();
- std::use_facet<ctype<_CharT> >(__loc).widen(__src, __src + 32, __atoms);
+ std::use_facet<ctype<_CharT> >(__loc).widen(__src, __src + __fp_chr_cnt, __atoms);
const numpunct<_CharT>& __np = std::use_facet<numpunct<_CharT> >(__loc);
__decimal_point = __np.decimal_point();
__thousands_sep = __np.thousands_sep();
@@ -490,7 +494,7 @@ __num_get<_CharT>::__stage2_int_loop(_CharT __ct, int __base, char* __a, char*&
}
return 0;
}
- ptrdiff_t __f = std::find(__atoms, __atoms + 26, __ct) - __atoms;
+ ptrdiff_t __f = std::find(__atoms, __atoms + __int_chr_cnt, __ct) - __atoms;
if (__f >= 24)
return -1;
switch (__base) {
@@ -546,8 +550,8 @@ int __num_get<_CharT>::__stage2_float_loop(
}
return 0;
}
- ptrdiff_t __f = std::find(__atoms, __atoms + 32, __ct) - __atoms;
- if (__f >= 32)
+ ptrdiff_t __f = std::find(__atoms, __atoms + __num_get_base::__fp_chr_cnt, __ct) - __atoms;
+ if (__f >= static_cast<ptrdiff_t>(__num_get_base::__fp_chr_cnt))
return -1;
char __x = __src[__f];
if (__x == '-' || __x == '+') {
@@ -846,7 +850,7 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_signed(
int __base = this->__get_base(__iob);
// Stage 2
char_type __thousands_sep;
- const int __atoms_size = 26;
+ const int __atoms_size = __num_get_base::__int_chr_cnt;
#ifdef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET
char_type __atoms1[__atoms_size];
const char_type* __atoms = this->__do_widen(__iob, __atoms1);
@@ -895,7 +899,7 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_unsigned(
int __base = this->__get_base(__iob);
// Stage 2
char_type __thousands_sep;
- const int __atoms_size = 26;
+ const int __atoms_size = __num_get_base::__int_chr_cnt;
#ifdef _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET
char_type __atoms1[__atoms_size];
const char_type* __atoms = this->__do_widen(__iob, __atoms1);
@@ -942,7 +946,7 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_floating_point(
iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Fp& __v) const {
// Stage 1, nothing to do
// Stage 2
- char_type __atoms[32];
+ char_type __atoms[__num_get_base::__fp_chr_cnt];
char_type __decimal_point;
char_type __thousands_sep;
string __grouping = this->__stage2_float_prep(__iob, __atoms, __decimal_point, __thousands_sep);
@@ -951,10 +955,11 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_floating_point(
char* __a = &__buf[0];
char* __a_end = __a;
unsigned __g[__num_get_base::__num_get_buf_sz];
- unsigned* __g_end = __g;
- unsigned __dc = 0;
- bool __in_units = true;
- char __exp = 'E';
+ unsigned* __g_end = __g;
+ unsigned __dc = 0;
+ bool __in_units = true;
+ char __exp = 'E';
+ bool __is_leading_parsed = false;
for (; __b != __e; ++__b) {
if (__a_end == __a + __buf.size()) {
size_t __tmp = __buf.size();
@@ -977,6 +982,21 @@ _InputIterator num_get<_CharT, _InputIterator>::__do_get_floating_point(
__dc,
__atoms))
break;
+
+ // the leading character excluding the sign must be a decimal digit
+ if (!__is_leading_parsed) {
+ if (__a_end - __a >= 1 && __a[0] != '-' && __a[0] != '+') {
+ if ('0' <= __a[0] && __a[0] <= '9')
+ __is_leading_parsed = true;
+ else
+ break;
+ } else if (__a_end - __a >= 2 && (__a[0] == '-' || __a[0] == '+')) {
+ if ('0' <= __a[1] && __a[1] <= '9')
+ __is_leading_parsed = true;
+ else
+ break;
+ }
+ }
}
if (__grouping.size() != 0 && __in_units && __g_end - __g < __num_get_base::__num_get_buf_sz)
*__g_end++ = __dc;
@@ -996,10 +1016,11 @@ _InputIterator num_get<_CharT, _InputIterator>::do_get(
// Stage 1
int __base = 16;
// Stage 2
- char_type __atoms[26];
+ char_type __atoms[__num_get_base::__int_chr_cnt];
char_type __thousands_sep = char_type();
string __grouping;
- std::use_facet<ctype<_CharT> >(__iob.getloc()).widen(__num_get_base::__src, __num_get_base::__src + 26, __atoms);
+ std::use_facet<ctype<_CharT> >(__iob.getloc())
+ .widen(__num_get_base::__src, __num_get_base::__src + __num_get_base::__int_chr_cnt, __atoms);
string __buf;
__buf.resize(__buf.capacity());
char* __a = &__buf[0];
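The <locale> hunk above does two things: it replaces the magic atom counts 26 and 32 with the named constants __int_chr_cnt and __fp_chr_cnt, and it adds a check that the first accumulated character (after an optional sign) during floating-point extraction is a decimal digit. The following standalone sketch illustrates only that leading-character rule; the function name and structure are hypothetical and are not libc++'s actual implementation.

```cpp
#include <cctype>
#include <cstddef>
#include <string>

// Hypothetical helper mirroring the idea of the new __is_leading_parsed check:
// a numeric token is only viable if, after an optional sign, it starts with a
// decimal digit (inputs such as "+p" or "-x" must stop the parse immediately).
bool has_valid_leading_char(const std::string& buf) {
  std::size_t i = 0;
  if (i < buf.size() && (buf[i] == '+' || buf[i] == '-'))
    ++i; // skip the optional sign
  return i < buf.size() && std::isdigit(static_cast<unsigned char>(buf[i]));
}

int main() {
  bool ok  = has_valid_leading_char("-3.5"); // true: a digit follows the sign
  bool bad = has_valid_leading_char("+p10"); // false: 'p' cannot lead a number
  return (ok && !bad) ? 0 : 1;
}
```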
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 70dac2f19846..8bc94d71391e 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1066,7 +1066,11 @@ module std_private_atomic_atomic_flag [system] {
}
module std_private_atomic_atomic_init [system] { header "__atomic/atomic_init.h" }
module std_private_atomic_atomic_lock_free [system] { header "__atomic/atomic_lock_free.h" }
-module std_private_atomic_atomic_sync [system] { header "__atomic/atomic_sync.h" }
+module std_private_atomic_atomic_ref [system] { header "__atomic/atomic_ref.h" }
+module std_private_atomic_atomic_sync [system] {
+ header "__atomic/atomic_sync.h"
+ export std_private_atomic_to_gcc_order
+}
module std_private_atomic_check_memory_order [system] { header "__atomic/check_memory_order.h" }
module std_private_atomic_contention_t [system] { header "__atomic/contention_t.h" }
module std_private_atomic_cxx_atomic_impl [system] { header "__atomic/cxx_atomic_impl.h" }
@@ -1074,6 +1078,10 @@ module std_private_atomic_fence [system] { header "__atomic/fence.
module std_private_atomic_is_always_lock_free [system] { header "__atomic/is_always_lock_free.h" }
module std_private_atomic_kill_dependency [system] { header "__atomic/kill_dependency.h" }
module std_private_atomic_memory_order [system] { header "__atomic/memory_order.h" }
+module std_private_atomic_to_gcc_order [system] {
+ header "__atomic/to_gcc_order.h"
+ export std_private_atomic_memory_order
+}
module std_private_bit_bit_cast [system] { header "__bit/bit_cast.h" }
module std_private_bit_bit_ceil [system] { header "__bit/bit_ceil.h" }
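The modulemap hunk above splits __atomic/to_gcc_order.h into its own submodule and re-exports it from std_private_atomic_atomic_sync, since the sync header uses the order-mapping helper in its inline interface. As a rough, assumption-laden sketch (not libc++'s code), such a helper translates std::memory_order into the __ATOMIC_* constants consumed by the compiler's __atomic_* builtins; this only compiles with GCC or Clang, which predefine those macros.

```cpp
#include <atomic>

// Rough sketch of a memory-order mapping for the GCC/Clang atomic builtins.
constexpr int to_gcc_order(std::memory_order order) {
  switch (order) {
  case std::memory_order_relaxed: return __ATOMIC_RELAXED;
  case std::memory_order_consume: return __ATOMIC_CONSUME;
  case std::memory_order_acquire: return __ATOMIC_ACQUIRE;
  case std::memory_order_release: return __ATOMIC_RELEASE;
  case std::memory_order_acq_rel: return __ATOMIC_ACQ_REL;
  case std::memory_order_seq_cst: return __ATOMIC_SEQ_CST;
  }
  return __ATOMIC_SEQ_CST; // unreachable for valid inputs
}
```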
diff --git a/libcxx/include/vector b/libcxx/include/vector
index 976bde9b9048..b190557fb7b7 100644
--- a/libcxx/include/vector
+++ b/libcxx/include/vector
@@ -424,11 +424,36 @@ public:
#endif
: __end_cap_(nullptr, __a) {
}
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n);
+
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n) {
+ auto __guard = std::__make_exception_guard(__destroy_vector(*this));
+ if (__n > 0) {
+ __vallocate(__n);
+ __construct_at_end(__n);
+ }
+ __guard.__complete();
+ }
+
#if _LIBCPP_STD_VER >= 14
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n, const allocator_type& __a);
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI explicit vector(size_type __n, const allocator_type& __a)
+ : __end_cap_(nullptr, __a) {
+ auto __guard = std::__make_exception_guard(__destroy_vector(*this));
+ if (__n > 0) {
+ __vallocate(__n);
+ __construct_at_end(__n);
+ }
+ __guard.__complete();
+ }
#endif
- _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __x);
+
+ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI vector(size_type __n, const value_type& __x) {
+ auto __guard = std::__make_exception_guard(__destroy_vector(*this));
+ if (__n > 0) {
+ __vallocate(__n);
+ __construct_at_end(__n, __x);
+ }
+ __guard.__complete();
+ }
template <__enable_if_t<__is_allocator<_Allocator>::value, int> = 0>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI
@@ -1126,39 +1151,6 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void vector<_Tp, _Allocator>::__append(size_type _
}
template <class _Tp, class _Allocator>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(size_type __n) {
- auto __guard = std::__make_exception_guard(__destroy_vector(*this));
- if (__n > 0) {
- __vallocate(__n);
- __construct_at_end(__n);
- }
- __guard.__complete();
-}
-
-#if _LIBCPP_STD_VER >= 14
-template <class _Tp, class _Allocator>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(size_type __n, const allocator_type& __a)
- : __end_cap_(nullptr, __a) {
- auto __guard = std::__make_exception_guard(__destroy_vector(*this));
- if (__n > 0) {
- __vallocate(__n);
- __construct_at_end(__n);
- }
- __guard.__complete();
-}
-#endif
-
-template <class _Tp, class _Allocator>
-_LIBCPP_CONSTEXPR_SINCE_CXX20 vector<_Tp, _Allocator>::vector(size_type __n, const value_type& __x) {
- auto __guard = std::__make_exception_guard(__destroy_vector(*this));
- if (__n > 0) {
- __vallocate(__n);
- __construct_at_end(__n, __x);
- }
- __guard.__complete();
-}
-
-template <class _Tp, class _Allocator>
template <class _InputIterator,
__enable_if_t<__has_exactly_input_iterator_category<_InputIterator>::value &&
is_constructible<_Tp, typename iterator_traits<_InputIterator>::reference>::value,
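The <vector> hunk above moves the size-based constructors into the class body; each one allocates, constructs the elements, and relies on an exception guard that tears the partially built vector down unless the guard is explicitly completed. A minimal, self-contained sketch of that guard idiom follows, using generic names rather than libc++'s internal __make_exception_guard machinery.

```cpp
#include <utility>

// Minimal sketch of the exception-guard idiom: run a rollback action on scope
// exit unless complete() was called first.
template <class Rollback>
class exception_guard {
public:
  explicit exception_guard(Rollback rollback) : rollback_(std::move(rollback)) {}
  ~exception_guard() {
    if (!completed_)
      rollback_();
  }
  void complete() noexcept { completed_ = true; }

private:
  Rollback rollback_;
  bool completed_ = false;
};

// Usage pattern mirroring vector(size_type): if allocation or element
// construction throws, the guard destroys whatever was already built.
void construct_n(int n) {
  exception_guard guard([] { /* destroy partially constructed storage */ });
  // ... allocate storage and construct n elements here ...
  (void)n;
  guard.complete(); // success: disarm the rollback
}
```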
diff --git a/libcxx/modules/std/atomic.inc b/libcxx/modules/std/atomic.inc
index d77d7a5bb0fb..e8cf90d01258 100644
--- a/libcxx/modules/std/atomic.inc
+++ b/libcxx/modules/std/atomic.inc
@@ -22,7 +22,7 @@ export namespace std {
// [atomics.ref.generic], class template atomic_ref
// [atomics.ref.pointer], partial specialization for pointers
- // using std::atomic_ref _LIBCPP_USING_IF_EXISTS;
+ using std::atomic_ref _LIBCPP_USING_IF_EXISTS;
// [atomics.types.generic], class template atomic
using std::atomic _LIBCPP_USING_IF_EXISTS;
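With atomic_ref now implemented, the std module exports it as well. The snippet below is ordinary C++20 std::atomic_ref usage (standard API, not specific to this diff): it performs atomic operations on a plain object for the lifetime of the reference.

```cpp
#include <atomic>
#include <cassert>

int main() {
  int counter = 0;
  std::atomic_ref<int> ref(counter); // atomic access to a non-atomic object
  ref.store(1);
  ref.fetch_add(2, std::memory_order_relaxed);
  assert(ref.load() == 3);
  // Once all atomic_refs to `counter` are gone, plain access is valid again.
  assert(counter == 3);
  return 0;
}
```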
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_strong.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_strong.pass.cpp
new file mode 100644
index 000000000000..066ed1191dd0
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_strong.pass.cpp
@@ -0,0 +1,58 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// bool compare_exchange_strong(T& expected, T desired, memory_order success, memory_order failure) const noexcept;
+//
+// Preconditions: failure is memory_order::relaxed, memory_order::consume, memory_order::acquire, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestCompareExchangeStrongInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_strong(t, T(3), std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_strong(t, T(3), std::memory_order_relaxed, std::memory_order_release);
+ }()),
+ "atomic_ref: failure memory order argument to strong atomic compare-and-exchange operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_strong(t, T(3), std::memory_order_relaxed, std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: failure memory order argument to strong atomic compare-and-exchange operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCompareExchangeStrongInvalidMemoryOrder>()();
+ return 0;
+}
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_weak.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_weak.pass.cpp
new file mode 100644
index 000000000000..e83a143df3f0
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.compare_exchange_weak.pass.cpp
@@ -0,0 +1,58 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// bool compare_exchange_weak(T& expected, T desired, memory_order success, memory_order failure) const noexcept;
+//
+// Preconditions: failure is memory_order::relaxed, memory_order::consume, memory_order::acquire, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestCompareExchangeWeakInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_weak(t, T(3), std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_weak(t, T(3), std::memory_order_relaxed, std::memory_order_release);
+ }()),
+ "atomic_ref: failure memory order argument to weak atomic compare-and-exchange operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ T t(T(2));
+ a.compare_exchange_weak(t, T(3), std::memory_order_relaxed, std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: failure memory order argument to weak atomic compare-and-exchange operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCompareExchangeWeakInvalidMemoryOrder>()();
+ return 0;
+}
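Both compare-exchange tests above exercise the same precondition noted in their headers: the failure ordering must not contain a release component. The tiny helper below is illustrative only and simply restates that rule as code.

```cpp
#include <atomic>

// Illustrative check for the precondition exercised above: the failure
// ordering of compare_exchange_{weak,strong} may not be release or acq_rel.
constexpr bool valid_failure_order(std::memory_order failure) {
  return failure != std::memory_order_release &&
         failure != std::memory_order_acq_rel;
}

static_assert(valid_failure_order(std::memory_order_acquire), "acquire is allowed");
static_assert(!valid_failure_order(std::memory_order_acq_rel), "acq_rel is rejected");
```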
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.ctor.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.ctor.pass.cpp
new file mode 100644
index 000000000000..ef3705d1db27
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.ctor.pass.cpp
@@ -0,0 +1,40 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// <atomic>
+
+// atomic_ref(T& obj);
+//
+// Preconditions: The referenced object is aligned to required_alignment.
+
+#include <atomic>
+#include <cstddef>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+ { // no assertion should trigger here
+ alignas(float) std::byte c[sizeof(float)];
+ float* f = new (c) float(3.14f);
+ [[maybe_unused]] std::atomic_ref<float> r(*f);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ alignas(float) std::byte c[2 * sizeof(float)]; // intentionally larger
+ float* f = new (c + 1) float(3.14f); // intentionally misaligned
+ [[maybe_unused]] std::atomic_ref<float> r(*f);
+ }()),
+ "atomic_ref ctor: referenced object must be aligned to required_alignment");
+
+ return 0;
+}
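The constructor test above relies on std::atomic_ref<T>::required_alignment, which may be stricter than alignof(T) (for example, to keep operations lock-free). A hedged sketch of checking that precondition before constructing an atomic_ref, with a hypothetical helper name:

```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>

// Sketch: verify the alignment precondition the test above exercises.
template <class T>
bool suitably_aligned(T& obj) {
  constexpr std::size_t req = std::atomic_ref<T>::required_alignment;
  return reinterpret_cast<std::uintptr_t>(&obj) % req == 0;
}

int main() {
  alignas(std::atomic_ref<long>::required_alignment) long value = 0;
  return suitably_aligned(value) ? 0 : 1;
}
```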
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.load.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.load.pass.cpp
new file mode 100644
index 000000000000..bc92b3dc3622
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.load.pass.cpp
@@ -0,0 +1,55 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// T load(memory_order order = memory_order::seq_cst) const noexcept;
+//
+// Preconditions: order is memory_order::relaxed, memory_order::consume, memory_order::acquire, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestLoadInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ (void)a.load(std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ (void)a.load(std::memory_order_release);
+ }()),
+ "atomic_ref: memory order argument to atomic load operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ (void)a.load(std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: memory order argument to atomic load operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestLoadInvalidMemoryOrder>()();
+ return 0;
+}
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.store.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.store.pass.cpp
new file mode 100644
index 000000000000..ab0d4a220c94
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.store.pass.cpp
@@ -0,0 +1,63 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// void store(T desired, memory_order order = memory_order::seq_cst) const noexcept;
+//
+// Preconditions: order is memory_order::relaxed, memory_order::release, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestStoreInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.store(T(2), std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.store(T(2), std::memory_order_consume);
+ }()),
+ "atomic_ref: memory order argument to atomic store operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.store(T(2), std::memory_order_acquire);
+ }()),
+ "atomic_ref: memory order argument to atomic store operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.store(T(2), std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: memory order argument to atomic store operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestStoreInvalidMemoryOrder>()();
+ return 0;
+}
diff --git a/libcxx/test/libcxx/atomics/atomics.ref/assert.wait.pass.cpp b/libcxx/test/libcxx/atomics/atomics.ref/assert.wait.pass.cpp
new file mode 100644
index 000000000000..dcec2fb62854
--- /dev/null
+++ b/libcxx/test/libcxx/atomics/atomics.ref/assert.wait.pass.cpp
@@ -0,0 +1,55 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: libcpp-hardening-mode=none || libcpp-hardening-mode=fast
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// ADDITIONAL_COMPILE_FLAGS: -Wno-user-defined-warnings
+
+// <atomic>
+
+// void wait(T old, memory_order order = memory_order::seq_cst) const noexcept;
+//
+// Preconditions: order is memory_order::relaxed, memory_order::consume, memory_order::acquire, or memory_order::seq_cst.
+
+#include <atomic>
+
+#include "atomic_helpers.h"
+#include "check_assertion.h"
+
+template <typename T>
+struct TestWaitInvalidMemoryOrder {
+ void operator()() const {
+ { // no assertion should trigger here
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.wait(T(2), std::memory_order_relaxed);
+ }
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.wait(T(2), std::memory_order_release);
+ }()),
+ "atomic_ref: memory order argument to atomic wait operation is invalid");
+
+ TEST_LIBCPP_ASSERT_FAILURE(
+ ([] {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+ a.wait(T(2), std::memory_order_acq_rel);
+ }()),
+ "atomic_ref: memory order argument to atomic wait operation is invalid");
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestWaitInvalidMemoryOrder>()();
+ return 0;
+}
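Taken together, the load, store, and wait assertion tests above cover the one-sided memory-order preconditions: load and wait reject orders with a release component, while store rejects orders with an acquire or consume component. The compact helpers below are an illustrative summary of those rules, not part of libc++.

```cpp
#include <atomic>

// Loads and waits must not request a release component.
constexpr bool valid_load_order(std::memory_order o) {
  return o != std::memory_order_release && o != std::memory_order_acq_rel;
}

// Stores are limited to relaxed, release, and seq_cst.
constexpr bool valid_store_order(std::memory_order o) {
  return o == std::memory_order_relaxed || o == std::memory_order_release ||
         o == std::memory_order_seq_cst;
}

static_assert(valid_load_order(std::memory_order_acquire), "");
static_assert(!valid_store_order(std::memory_order_acquire), "");
```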
diff --git a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
index f94ceaf57dba..aa3ce210e363 100644
--- a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
+++ b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
@@ -21,6 +21,9 @@
// GCC doesn't support the aligned-allocation flags.
// XFAIL: gcc
+// TODO(mordante) fix this test after updating clang in Docker
+// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19
+
// RUN: %{build} -faligned-allocation -fsized-deallocation
// RUN: %{run}
// RUN: %{build} -faligned-allocation -fno-sized-deallocation -DNO_SIZE
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.exception_handling.pass.cpp
deleted file mode 100644
index dda642be85bc..000000000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::fill(ExecutionPolicy) and std::fill_n(ExecutionPolicy) terminate on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-#ifndef TEST_HAS_NO_EXCEPTIONS
-struct ThrowOnCopy {
- ThrowOnCopy& operator=(const ThrowOnCopy&) { throw int{}; }
-};
-#endif
-
-int main(int, char**) {
- ThrowOnCopy a[2]{};
- int b[2]{};
-
- test_execution_policies([&](auto&& policy) {
- // std::fill
- EXPECT_STD_TERMINATE([&] { (void)std::fill(policy, std::begin(a), std::end(a), ThrowOnCopy{}); });
- EXPECT_STD_TERMINATE([&] {
- try {
- (void)std::fill(
- policy, util::throw_on_move_iterator(std::begin(b), 1), util::throw_on_move_iterator(std::end(b), 1), 0);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::fill_n
- EXPECT_STD_TERMINATE([&] { (void)std::fill_n(policy, std::begin(a), std::size(a), ThrowOnCopy{}); });
- EXPECT_STD_TERMINATE([&] {
- try {
- (void)std::fill_n(policy, util::throw_on_move_iterator(std::begin(b), 1), std::size(b), 0);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.exception_handling.pass.cpp
deleted file mode 100644
index bb8ab4217222..000000000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::move(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- int b[] = {1, 2};
- (void)std::move(policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(b), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.replace/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.replace/pstl.exception_handling.pass.cpp
deleted file mode 100644
index c02496bef421..000000000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.replace/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::replace(ExecutionPolicy), std::replace_if(ExecutionPolicy), std::replace_copy(ExecutionPolicy)
-// and std::replace_copy_if(ExecutionPolicy) terminate on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-struct ThrowOnCompare {};
-
-#ifndef TEST_HAS_NO_EXCEPTIONS
-bool operator==(ThrowOnCompare, ThrowOnCompare) { throw int{}; }
-#endif
-
-int main(int, char**) {
- test_execution_policies([&](auto&& policy) {
- // std::replace
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2]{};
- (void)std::replace(policy, std::begin(a), std::end(a), ThrowOnCompare{}, ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::replace(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1), 1, 2);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::replace_if
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2]{};
- (void)std::replace_if(
- policy, std::begin(a), std::end(a), [](ThrowOnCompare&) -> bool { throw int{}; }, ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::replace_if(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; },
- 2);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::replace_copy
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2]{};
- (void)std::replace_copy(policy, std::begin(a), std::end(a), std::begin(a), ThrowOnCompare{}, ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::replace_copy(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- 1,
- 2);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::replace_copy_if
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2]{};
- (void)std::replace_copy_if(
- policy,
- std::begin(a),
- std::end(a),
- std::begin(a),
- [](ThrowOnCompare& i) { return i == i; },
- ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::replace_copy_if(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- [](int) { return true; },
- 2);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 88d177a6e39f..000000000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.rotate/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::find(ExecutionPolicy), std::find_if(ExecutionPolicy) and std::find_if_not(ExecutionPolicy) terminate
-// on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- int b[] = {1, 2};
- (void)std::rotate_copy(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(b), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.transform/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.transform/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 439204060e18..000000000000
--- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.transform/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::transform(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([&](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[2]{};
- int b[2]{};
- int c[2]{};
- (void)std::transform(
- policy, std::begin(a), std::end(a), std::begin(b), std::begin(c), [](auto v, auto) -> decltype(v) {
- throw int{};
- });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::transform(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- [](int i) { return i; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- EXPECT_STD_TERMINATE([&] {
- int a[2]{};
- int b[2]{};
- (void)std::transform(policy, std::begin(a), std::end(a), std::begin(b), [](auto v) -> decltype(v) {
- throw int{};
- });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::transform(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- std::plus{});
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.exception_handling.pass.cpp
deleted file mode 100644
index d1c031bdd97a..000000000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::all_of(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::all_of(policy, std::begin(a), std::end(a), [](int i) -> bool { throw i; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::all_of(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 58fe79b34c00..000000000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::any_of(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::any_of(policy, std::begin(a), std::end(a), [](int i) -> bool { throw i; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::any_of(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 1bcd858f3c02..000000000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::equal(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- int b[] = {1, 2};
- (void)std::equal(policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(b), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- int b[] = {1, 2};
- (void)std::equal(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(b), 1),
- util::throw_on_move_iterator(std::end(b), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/pstl.exception_handling.pass.cpp
deleted file mode 100644
index b0ee4f8d062e..000000000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::find(ExecutionPolicy), std::find_if(ExecutionPolicy) and std::find_if_not(ExecutionPolicy) terminate
-// on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-struct ThrowOnCompare {};
-
-#ifndef TEST_HAS_NO_EXCEPTIONS
-bool operator==(ThrowOnCompare, ThrowOnCompare) { throw int{}; }
-#endif
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- // std::find
- EXPECT_STD_TERMINATE([&] {
- ThrowOnCompare a[2] = {};
- (void)std::find(policy, std::begin(a), std::end(a), ThrowOnCompare{});
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::find(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1), 0);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::find_if
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::find_if(policy, std::begin(a), std::end(a), [](int) -> bool { throw int{}; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::find_if(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::find_if_not
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::find_if_not(policy, std::begin(a), std::end(a), [](int) -> bool { throw int{}; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::find_if_not(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.exception_handling.pass.cpp
deleted file mode 100644
index a63276f1e025..000000000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::for_each(ExecutionPolicy) and std::for_each_n(ExecutionPolicy) terminate on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- int a[] = {1, 2};
- // std::for_each
- EXPECT_STD_TERMINATE([&] { std::for_each(policy, std::begin(a), std::end(a), [](int) { throw int{}; }); });
- EXPECT_STD_TERMINATE([&] {
- try {
- (void)std::for_each(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) {});
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- // std::for_each_n
- EXPECT_STD_TERMINATE([&] { std::for_each_n(policy, std::data(a), std::size(a), [](int) { throw int{}; }); });
- EXPECT_STD_TERMINATE([&] {
- try {
- (void)std::for_each_n(policy, util::throw_on_move_iterator(std::begin(a), 1), std::size(a), [](int) {});
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 26e6fea5904f..000000000000
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::none_of(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- (void)std::none_of(policy, std::begin(a), std::end(a), [](int i) -> bool { throw i; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::none_of(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- [](int) { return true; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.exception_handling.pass.cpp
deleted file mode 100644
index b48a5a9fa2b7..000000000000
--- a/libcxx/test/std/algorithms/alg.sorting/alg.merge/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::merge(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- std::merge(policy, std::begin(a), std::end(a), std::begin(a), std::end(a), std::begin(a), [](int, int) -> bool {
- throw int{};
- });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::merge(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- std::less{});
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-
- return 0;
-}
diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 1dc603cfaa55..000000000000
--- a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::stable_sort(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <algorithm>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- int a[] = {1, 2};
- std::stable_sort(policy, std::begin(a), std::end(a), [](int, int) -> bool { throw int{}; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::stable_sort(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.exception_handling.pass.cpp
deleted file mode 100644
index d52889b1be14..000000000000
--- a/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::reduce(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <numeric>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([&](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::reduce(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1));
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- EXPECT_STD_TERMINATE([&] {
- int a[2]{};
- (void)std::reduce(policy, std::begin(a), std::end(a), 1, [](int, int) -> int { throw 1; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::reduce(
- policy, util::throw_on_move_iterator(std::begin(a), 1), util::throw_on_move_iterator(std::end(a), 1), 1);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.exception_handling.pass.cpp
deleted file mode 100644
index 5ac04334f000..000000000000
--- a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.exception_handling.pass.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14
-// UNSUPPORTED: no-exceptions
-// `check_assertion.h` requires Unix headers and regex support.
-// UNSUPPORTED: !has-unix-headers, no-localization
-
-// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-
-// check that std::reduce(ExecutionPolicy) terminates on user-thrown exceptions
-
-#include <numeric>
-
-#include "check_assertion.h"
-#include "test_execution_policies.h"
-#include "test_iterators.h"
-
-int main(int, char**) {
- test_execution_policies([&](auto&& policy) {
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::transform_reduce(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- util::throw_on_move_iterator(std::begin(a), 1),
- 1);
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
-
- EXPECT_STD_TERMINATE([&] {
- int a[2]{};
- (void)std::transform_reduce(
- policy, std::begin(a), std::end(a), 1, [](int, int) -> int { throw 1; }, [](int) -> int { return 0; });
- });
- EXPECT_STD_TERMINATE([&] {
- try {
- int a[] = {1, 2};
- (void)std::transform_reduce(
- policy,
- util::throw_on_move_iterator(std::begin(a), 1),
- util::throw_on_move_iterator(std::end(a), 1),
- 1,
- std::plus{},
- [](int) -> int { return 0; });
- } catch (const util::iterator_error&) {
- assert(false);
- }
- std::terminate(); // make the test pass in case the algorithm didn't move the iterator
- });
- });
-}
diff --git a/libcxx/test/std/algorithms/pstl.exception_handling.pass.cpp b/libcxx/test/std/algorithms/pstl.exception_handling.pass.cpp
new file mode 100644
index 000000000000..bedb2258d1fd
--- /dev/null
+++ b/libcxx/test/std/algorithms/pstl.exception_handling.pass.cpp
@@ -0,0 +1,339 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+// UNSUPPORTED: no-exceptions
+// `check_assertion.h` requires Unix headers and regex support.
+// UNSUPPORTED: !has-unix-headers, no-localization
+
+// UNSUPPORTED: libcpp-has-no-incomplete-pstl
+
+// <algorithm>
+// <numeric>
+//
+// Check that PSTL algorithms terminate on user-thrown exceptions.
+
+#include <algorithm>
+#include <numeric>
+
+#include "check_assertion.h"
+#include "test_execution_policies.h"
+#include "test_iterators.h"
+
+template <class F>
+void assert_non_throwing(F f) {
+ // We wrap this whole test in EXPECT_STD_TERMINATE because if f() terminates, we want the test to pass,
+ // since this signals proper handling of user exceptions in the PSTL.
+ EXPECT_STD_TERMINATE([&] {
+ bool threw = false;
+ try {
+ f();
+ } catch (...) {
+ threw = true;
+ }
+ // If nothing was thrown, call std::terminate() to pass the EXPECT_STD_TERMINATE assertion.
+ // Otherwise, don't call std::terminate() to fail the assertion.
+ if (!threw)
+ std::terminate();
+ });
+}
+
+struct ThrowToken {
+ void activate() { active_ = true; }
+ void deactivate() { active_ = false; }
+ bool active() const { return active_; }
+
+private:
+ bool active_{false};
+};
+
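+// Minimal RAII scope guard: runs the stored callable when destroyed. The loop below uses it to
+// deactivate the current ThrowToken at the end of each iteration.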
+template <class Func>
+struct on_scope_exit {
+ explicit on_scope_exit(Func func) : func_(func) {}
+ ~on_scope_exit() { func_(); }
+
+private:
+ Func func_;
+};
+template <class Func>
+on_scope_exit(Func) -> on_scope_exit<Func>;
+
+int main(int, char**) {
+ test_execution_policies([&](auto&& policy) {
+ int a[] = {1, 2, 3, 4};
+ int b[] = {1, 2, 3};
+ int n = 2;
+ int storage[999];
+ int val = 99;
+ int init = 1;
+
+ // We generate a certain number of "tokens" and we activate exactly one on each iteration. We then
+ // throw in a given operation only when that token is active. That way we check that each argument
+ // of the algorithm is handled properly.
+ ThrowToken tokens[7];
+ for (ThrowToken& t : tokens) {
+ t.activate();
+ on_scope_exit _([&] { t.deactivate(); });
+
+ auto first1 = util::throw_on_move_iterator(std::begin(a), tokens[0].active() ? 1 : -1);
+ auto last1 = util::throw_on_move_iterator(std::end(a), tokens[1].active() ? 1 : -1);
+ auto first2 = util::throw_on_move_iterator(std::begin(b), tokens[2].active() ? 1 : -1);
+ auto last2 = util::throw_on_move_iterator(std::end(b), tokens[3].active() ? 1 : -1);
+ auto dest = util::throw_on_move_iterator(std::end(storage), tokens[4].active() ? 1 : -1);
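+ // maybe_throw wraps a callable so that it throws whenever the given token is active; tokens[5] and
+ // tokens[6] are used below to exercise the functor arguments (predicates, comparators, reductions).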
+ auto maybe_throw = [](ThrowToken const& token, auto f) {
+ return [&token, f](auto... args) {
+ if (token.active())
+ throw 1;
+ return f(args...);
+ };
+ };
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // all_of(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::all_of(policy, std::move(first1), std::move(last1), pred); });
+
+ // any_of(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::any_of(policy, std::move(first1), std::move(last1), pred); });
+
+ // none_of(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::none_of(policy, std::move(first1), std::move(last1), pred); });
+ }
+
+ {
+ // copy(first, last, dest)
+ assert_non_throwing([=, &policy] {
+ (void)std::copy(policy, std::move(first1), std::move(last1), std::move(dest));
+ });
+
+ // copy_n(first, n, dest)
+ assert_non_throwing([=, &policy] { (void)std::copy_n(policy, std::move(first1), n, std::move(dest)); });
+ }
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // count(first, last, val)
+ assert_non_throwing([=, &policy] { (void)std::count(policy, std::move(first1), std::move(last1), val); });
+
+ // count_if(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::count_if(policy, std::move(first1), std::move(last1), pred); });
+ }
+
+ {
+ auto binary_pred = maybe_throw(tokens[5], [](int x, int y) -> bool { return x == y; });
+
+ // equal(first1, last1, first2)
+ assert_non_throwing([=, &policy] {
+ (void)std::equal(policy, std::move(first1), std::move(last1), std::move(first2));
+ });
+
+ // equal(first1, last1, first2, binary_pred)
+ assert_non_throwing([=, &policy] {
+ (void)std::equal(policy, std::move(first1), std::move(last1), std::move(first2), binary_pred);
+ });
+
+ // equal(first1, last1, first2, last2)
+ assert_non_throwing([=, &policy] {
+ (void)std::equal(policy, std::move(first1), std::move(last1), std::move(first2), std::move(last2));
+ });
+
+ // equal(first1, last1, first2, last2, binary_pred)
+ assert_non_throwing([=, &policy] {
+ (void)std::equal(
+ policy, std::move(first1), std::move(last1), std::move(first2), std::move(last2), binary_pred);
+ });
+ }
+
+ {
+ // fill(first, last, val)
+ assert_non_throwing([=, &policy] { (void)std::fill(policy, std::move(first1), std::move(last1), val); });
+
+ // fill_n(first, n, val)
+ assert_non_throwing([=, &policy] { (void)std::fill_n(policy, std::move(first1), n, val); });
+ }
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // find(first, last, val)
+ assert_non_throwing([=, &policy] { (void)std::find(policy, std::move(first1), std::move(last1), val); });
+
+ // find_if(first, last, pred)
+ assert_non_throwing([=, &policy] { (void)std::find_if(policy, std::move(first1), std::move(last1), pred); });
+
+ // find_if_not(first, last, pred)
+ assert_non_throwing([=, &policy] {
+ (void)std::find_if_not(policy, std::move(first1), std::move(last1), pred);
+ });
+ }
+
+ {
+ auto func = maybe_throw(tokens[5], [](int) {});
+
+ // for_each(first, last, func)
+ assert_non_throwing([=, &policy] { (void)std::for_each(policy, std::move(first1), std::move(last1), func); });
+
+ // for_each_n(first, n, func)
+ assert_non_throwing([=, &policy] { (void)std::for_each_n(policy, std::move(first1), n, func); });
+ }
+
+ {
+ auto gen = maybe_throw(tokens[5], []() -> int { return 42; });
+
+ // generate(first, last, func)
+ assert_non_throwing([=, &policy] { (void)std::generate(policy, std::move(first1), std::move(last1), gen); });
+
+ // generate_n(first, n, func)
+ assert_non_throwing([=, &policy] { (void)std::generate_n(policy, std::move(first1), n, gen); });
+ }
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // is_partitioned(first, last, pred)
+ assert_non_throwing([=, &policy] {
+ (void)std::is_partitioned(policy, std::move(first1), std::move(last1), pred);
+ });
+ }
+
+ {
+ auto compare = maybe_throw(tokens[5], [](int x, int y) -> bool { return x < y; });
+
+ // merge(first1, last1, first2, last2, dest)
+ assert_non_throwing([=, &policy] {
+ (void)std::merge(
+ policy, std::move(first1), std::move(last1), std::move(first2), std::move(last2), std::move(dest));
+ });
+
+ // merge(first1, last1, first2, last2, dest, comp)
+ assert_non_throwing([=, &policy] {
+ (void)std::merge(
+ policy,
+ std::move(first1),
+ std::move(last1),
+ std::move(first2),
+ std::move(last2),
+ std::move(dest),
+ compare);
+ });
+ }
+
+ {
+ // move(first, last, dest)
+ assert_non_throwing([=, &policy] {
+ (void)std::move(policy, std::move(first1), std::move(last1), std::move(dest));
+ });
+ }
+
+ {
+ auto pred = maybe_throw(tokens[5], [](int x) -> bool { return x % 2 == 0; });
+
+ // replace_if(first, last, pred, val)
+ assert_non_throwing([=, &policy] {
+ (void)std::replace_if(policy, std::move(first1), std::move(last1), pred, val);
+ });
+
+ // replace(first, last, val1, val2)
+ assert_non_throwing([=, &policy] {
+ (void)std::replace(policy, std::move(first1), std::move(last1), val, val);
+ });
+
+ // replace_copy_if(first, last, dest, pred, val)
+ assert_non_throwing([=, &policy] {
+ (void)std::replace_copy_if(policy, std::move(first1), std::move(last1), std::move(dest), pred, val);
+ });
+
+ // replace_copy(first, last, dest, val1, val2)
+ assert_non_throwing([=, &policy] {
+ (void)std::replace_copy(policy, std::move(first1), std::move(last1), std::move(dest), val, val);
+ });
+ }
+
+ {
+ auto mid1 = util::throw_on_move_iterator(std::begin(a) + 2, tokens[5].active() ? 1 : -1);
+
+ // rotate_copy(first, mid, last, dest)
+ assert_non_throwing([=, &policy] {
+ (void)std::rotate_copy(policy, std::move(first1), std::move(mid1), std::move(last1), std::move(dest));
+ });
+ }
+
+ {
+ auto compare = maybe_throw(tokens[5], [](int x, int y) -> bool { return x < y; });
+
+ // sort(first, last)
+ assert_non_throwing([=, &policy] { (void)std::sort(policy, std::move(first1), std::move(last1)); });
+
+ // sort(first, last, comp)
+ assert_non_throwing([=, &policy] { (void)std::sort(policy, std::move(first1), std::move(last1), compare); });
+
+ // stable_sort(first, last)
+ assert_non_throwing([=, &policy] { (void)std::stable_sort(policy, std::move(first1), std::move(last1)); });
+
+ // stable_sort(first, last, comp)
+ assert_non_throwing([=, &policy] {
+ (void)std::stable_sort(policy, std::move(first1), std::move(last1), compare);
+ });
+ }
+
+ {
+ auto unary = maybe_throw(tokens[5], [](int x) -> int { return x * 2; });
+ auto binary = maybe_throw(tokens[5], [](int x, int y) -> int { return x * y; });
+
+ // transform(first, last, dest, func)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform(policy, std::move(first1), std::move(last1), std::move(dest), unary);
+ });
+
+ // transform(first1, last1, first2, dest, func)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform(policy, std::move(first1), std::move(last1), std::move(first2), std::move(dest), binary);
+ });
+ }
+
+ {
+ auto reduction = maybe_throw(tokens[5], [](int x, int y) -> int { return x + y; });
+ auto transform_unary = maybe_throw(tokens[6], [](int x) -> int { return x * 2; });
+ auto transform_binary = maybe_throw(tokens[6], [](int x, int y) -> int { return x * y; });
+
+ // transform_reduce(first1, last1, first2, init)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform_reduce(policy, std::move(first1), std::move(last1), std::move(first2), init);
+ });
+
+ // transform_reduce(first1, last1, init, reduce, transform)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform_reduce(policy, std::move(first1), std::move(last1), init, reduction, transform_unary);
+ });
+
+ // transform_reduce(first1, last1, first2, init, reduce, transform)
+ assert_non_throwing([=, &policy] {
+ (void)std::transform_reduce(
+ policy, std::move(first1), std::move(last1), std::move(first2), init, reduction, transform_binary);
+ });
+ }
+
+ {
+ auto reduction = maybe_throw(tokens[5], [](int x, int y) -> int { return x + y; });
+
+ // reduce(first, last)
+ assert_non_throwing([=, &policy] { (void)std::reduce(policy, std::move(first1), std::move(last1)); });
+
+ // reduce(first, last, init)
+ assert_non_throwing([=, &policy] { (void)std::reduce(policy, std::move(first1), std::move(last1), init); });
+
+ // reduce(first, last, init, binop)
+ assert_non_throwing([=, &policy] {
+ (void)std::reduce(policy, std::move(first1), std::move(last1), init, reduction);
+ });
+ }
+ }
+ });
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/assign.pass.cpp b/libcxx/test/std/atomics/atomics.ref/assign.pass.cpp
new file mode 100644
index 000000000000..3887211752c6
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/assign.pass.cpp
@@ -0,0 +1,50 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// T operator=(T) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestAssign {
+ void operator()() const {
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a = T(2));
+ assert(y == T(2));
+ assert(x == T(2));
+
+ ASSERT_NOEXCEPT(a = T(0));
+ static_assert(std::is_nothrow_assignable_v<std::atomic_ref<T>, T>);
+
+ static_assert(!std::is_copy_assignable_v<std::atomic_ref<T>>);
+ }
+
+ {
+ auto assign = [](std::atomic_ref<T> const& y, T, T new_val) { y = new_val; };
+ auto load = [](std::atomic_ref<T> const& y) { return y.load(); };
+ test_seq_cst<T>(assign, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestAssign>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/bitwise_and_assign.pass.cpp b/libcxx/test/std/atomics/atomics.ref/bitwise_and_assign.pass.cpp
new file mode 100644
index 000000000000..2be1e9962880
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/bitwise_and_assign.pass.cpp
@@ -0,0 +1,60 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator&=(integral-type) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_bitwise_and_assign = requires { std::declval<T const>() &= std::declval<T>(); };
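+// has_bitwise_and_assign detects whether operator&= can be applied to a const object of type T;
+// atomic_ref's compound-assignment operators are const-qualified, so this probes the right overload.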
+
+template <typename T>
+struct TestDoesNotHaveBitwiseAndAssign {
+ void operator()() const { static_assert(!has_bitwise_and_assign<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestBitwiseAndAssign {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a &= T(1));
+ assert(y == T(1));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a &= T(0));
+
+ y = (a &= T(2));
+ assert(y == T(0));
+ assert(x == T(0));
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestBitwiseAndAssign>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveBitwiseAndAssign>()();
+
+ TestEachPointerType<TestDoesNotHaveBitwiseAndAssign>()();
+
+ TestDoesNotHaveBitwiseAndAssign<bool>()();
+ TestDoesNotHaveBitwiseAndAssign<UserAtomicType>()();
+ TestDoesNotHaveBitwiseAndAssign<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/bitwise_or_assign.pass.cpp b/libcxx/test/std/atomics/atomics.ref/bitwise_or_assign.pass.cpp
new file mode 100644
index 000000000000..5c22c8a2b2b6
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/bitwise_or_assign.pass.cpp
@@ -0,0 +1,56 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator|=(integral-type) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_bitwise_or_assign = requires { std::declval<T const>() |= std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveBitwiseOrAssign {
+ void operator()() const { static_assert(!has_bitwise_or_assign<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestBitwiseOrAssign {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a |= T(2));
+ assert(y == T(3));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a |= T(0));
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestBitwiseOrAssign>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveBitwiseOrAssign>()();
+
+ TestEachPointerType<TestDoesNotHaveBitwiseOrAssign>()();
+
+ TestDoesNotHaveBitwiseOrAssign<bool>()();
+ TestDoesNotHaveBitwiseOrAssign<UserAtomicType>()();
+ TestDoesNotHaveBitwiseOrAssign<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/bitwise_xor_assign.pass.cpp b/libcxx/test/std/atomics/atomics.ref/bitwise_xor_assign.pass.cpp
new file mode 100644
index 000000000000..4dc4fd307f58
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/bitwise_xor_assign.pass.cpp
@@ -0,0 +1,56 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator^=(integral-type) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_bitwise_xor_assign = requires { std::declval<T const>() ^= std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveBitwiseXorAssign {
+ void operator()() const { static_assert(!has_bitwise_xor_assign<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestBitwiseXorAssign {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a ^= T(2));
+ assert(y == T(3));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a ^= T(0));
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestBitwiseXorAssign>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveBitwiseXorAssign>()();
+
+ TestEachPointerType<TestDoesNotHaveBitwiseXorAssign>()();
+
+ TestDoesNotHaveBitwiseXorAssign<bool>()();
+ TestDoesNotHaveBitwiseXorAssign<UserAtomicType>()();
+ TestDoesNotHaveBitwiseXorAssign<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
new file mode 100644
index 000000000000..72b2f444c476
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_strong.pass.cpp
@@ -0,0 +1,221 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// bool compare_exchange_strong(T&, T, memory_order, memory_order) const noexcept;
+// bool compare_exchange_strong(T&, T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestCompareExchangeStrong {
+ void operator()() const {
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_strong(t, T(2));
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_strong(t, T(3));
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_strong(t, T(2)));
+ }
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_strong(t, T(2), std::memory_order_seq_cst);
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_strong(t, T(3), std::memory_order_seq_cst);
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_strong(t, T(2), std::memory_order_seq_cst));
+ }
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y =
+ a.compare_exchange_strong(t, T(2), std::memory_order_release, std::memory_order_relaxed);
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_strong(t, T(3), std::memory_order_release, std::memory_order_relaxed);
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_strong(t, T(2), std::memory_order_release, std::memory_order_relaxed));
+ }
+
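+ // The blocks below exercise the success and failure memory orderings, using the
+ // test_acquire_release and test_seq_cst helpers from test_helper.h.
+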
+ // success memory_order::release
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::release, std::memory_order::relaxed);
+ assert(r);
+ };
+
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+ test_acquire_release<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::release);
+ assert(r);
+ };
+ test_acquire_release<T>(store_one_arg, load);
+ }
+
+ // success memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::acquire, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::acquire)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load_one_arg);
+ }
+
+ // success memory_order::acq_rel
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::acq_rel, std::memory_order::relaxed);
+ assert(r);
+ };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::acq_rel, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::acq_rel);
+ assert(r);
+ };
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::acq_rel)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store_one_arg, load_one_arg);
+ }
+
+ // success memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::seq_cst, std::memory_order::relaxed);
+ assert(r);
+ };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::seq_cst, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_seq_cst<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ auto r = x.compare_exchange_strong(old_val, new_val, std::memory_order::seq_cst);
+ assert(r);
+ };
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_strong(val, val, std::memory_order::seq_cst)) {
+ }
+ return val;
+ };
+ test_seq_cst<T>(store_one_arg, load_one_arg);
+ }
+
+ // failure memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r =
+ x.compare_exchange_strong(unexpected, unexpected, std::memory_order::relaxed, std::memory_order::acquire);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r = x.compare_exchange_strong(unexpected, unexpected, std::memory_order::acquire);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load_one_arg);
+
+ // in the single-order overload, acq_rel is replaced by acquire for the failure ordering
+ auto load_one_arg_acq_rel = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r = x.compare_exchange_strong(unexpected, unexpected, std::memory_order::acq_rel);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load_one_arg_acq_rel);
+ }
+
+ // failure memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::seq_cst); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r =
+ x.compare_exchange_strong(unexpected, unexpected, std::memory_order::relaxed, std::memory_order::seq_cst);
+ assert(!r);
+ return result;
+ };
+ test_seq_cst<T>(store, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCompareExchangeStrong>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
new file mode 100644
index 000000000000..5219a8e3714f
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
@@ -0,0 +1,226 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// bool compare_exchange_weak(T&, T, memory_order, memory_order) const noexcept;
+// bool compare_exchange_weak(T&, T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <concepts>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestCompareExchangeWeak {
+ void operator()() const {
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_weak(t, T(2));
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_weak(t, T(3));
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_weak(t, T(2)));
+ }
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_weak(t, T(2), std::memory_order_seq_cst);
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_weak(t, T(3), std::memory_order_seq_cst);
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_weak(t, T(2), std::memory_order_seq_cst));
+ }
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ T t(T(1));
+ std::same_as<bool> decltype(auto) y =
+ a.compare_exchange_weak(t, T(2), std::memory_order_release, std::memory_order_relaxed);
+ assert(y == true);
+ assert(a == T(2));
+ assert(t == T(1));
+ y = a.compare_exchange_weak(t, T(3), std::memory_order_release, std::memory_order_relaxed);
+ assert(y == false);
+ assert(a == T(2));
+ assert(t == T(2));
+
+ ASSERT_NOEXCEPT(a.compare_exchange_weak(t, T(2), std::memory_order_release, std::memory_order_relaxed));
+ }
+
+ // success memory_order::release
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::release, std::memory_order::relaxed)) {
+ }
+ };
+
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+ test_acquire_release<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::release)) {
+ }
+ };
+ test_acquire_release<T>(store_one_arg, load);
+ }
+
+ // success memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::acquire, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::acquire)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load_one_arg);
+ }
+
+ // success memory_order::acq_rel
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::acq_rel, std::memory_order::relaxed)) {
+ }
+ };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::acq_rel, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::acq_rel)) {
+ }
+ };
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::acq_rel)) {
+ }
+ return val;
+ };
+ test_acquire_release<T>(store_one_arg, load_one_arg);
+ }
+
+ // success memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::seq_cst, std::memory_order::relaxed)) {
+ }
+ };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::seq_cst, std::memory_order::relaxed)) {
+ }
+ return val;
+ };
+ test_seq_cst<T>(store, load);
+
+ auto store_one_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ // could fail spuriously, so put it in a loop
+ while (!x.compare_exchange_weak(old_val, new_val, std::memory_order::seq_cst)) {
+ }
+ };
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto val = x.load(std::memory_order::relaxed);
+ while (!x.compare_exchange_weak(val, val, std::memory_order::seq_cst)) {
+ }
+ return val;
+ };
+ test_seq_cst<T>(store_one_arg, load_one_arg);
+ }
+
+ // failure memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r =
+ x.compare_exchange_weak(unexpected, unexpected, std::memory_order::relaxed, std::memory_order::acquire);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load);
+
+ auto load_one_arg = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r = x.compare_exchange_weak(unexpected, unexpected, std::memory_order::acquire);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load_one_arg);
+
+ // in the single-order overload, acq_rel is replaced by acquire for the failure ordering
+ auto load_one_arg_acq_rel = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r = x.compare_exchange_weak(unexpected, unexpected, std::memory_order::acq_rel);
+ assert(!r);
+ return result;
+ };
+ test_acquire_release<T>(store, load_one_arg_acq_rel);
+ }
+
+ // failure memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::seq_cst); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ T unexpected(T(255));
+ bool r =
+ x.compare_exchange_weak(unexpected, unexpected, std::memory_order::relaxed, std::memory_order::seq_cst);
+ assert(!r);
+ return result;
+ };
+ test_seq_cst<T>(store, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCompareExchangeWeak>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/convert.pass.cpp b/libcxx/test/std/atomics/atomics.ref/convert.pass.cpp
new file mode 100644
index 000000000000..2a58a5ea6ae2
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/convert.pass.cpp
@@ -0,0 +1,45 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// operator T() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestConvert {
+ void operator()() const {
+ T x(T(1));
+
+ T copy = x;
+ std::atomic_ref<T> const a(copy);
+
+ T converted = a;
+ assert(converted == x);
+
+ ASSERT_NOEXCEPT(T(a));
+ static_assert(std::is_nothrow_convertible_v<std::atomic_ref<T>, T>);
+
+ auto store = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val); };
+ auto load = [](std::atomic_ref<T> const& y) { return static_cast<T>(y); };
+ test_seq_cst<T>(store, load);
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestConvert>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/ctor.pass.cpp b/libcxx/test/std/atomics/atomics.ref/ctor.pass.cpp
new file mode 100644
index 000000000000..d6c647406abf
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/ctor.pass.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <atomic>
+
+// explicit atomic_ref(T&);
+
+#include <atomic>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestCtor {
+ void operator()() const {
+ // check that the constructor is explicit
+ static_assert(!std::is_convertible_v<T, std::atomic_ref<T>>);
+ static_assert(std::is_constructible_v<std::atomic_ref<T>, T&>);
+
+ T x(T(0));
+ std::atomic_ref<T> a(x);
+ (void)a;
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestCtor>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/deduction.pass.cpp b/libcxx/test/std/atomics/atomics.ref/deduction.pass.cpp
new file mode 100644
index 000000000000..24a399ac4711
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/deduction.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <atomic>
+
+// explicit atomic_ref(T&);
+
+#include <atomic>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestDeduction {
+ void operator()() const {
+ T x(T(0));
+ std::atomic_ref a(x);
+ ASSERT_SAME_TYPE(decltype(a), std::atomic_ref<T>);
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestDeduction>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
new file mode 100644
index 000000000000..cd998d46b7e8
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/exchange.pass.cpp
@@ -0,0 +1,45 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// T exchange(T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestExchange {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.exchange(T(2));
+ assert(y == T(1));
+ ASSERT_NOEXCEPT(a.exchange(T(2)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.exchange(T(3), std::memory_order_seq_cst);
+ assert(y == T(2));
+ ASSERT_NOEXCEPT(a.exchange(T(3), std::memory_order_seq_cst));
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestExchange>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_add.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_add.pass.cpp
new file mode 100644
index 000000000000..908a6879bd06
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_add.pass.cpp
@@ -0,0 +1,113 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_add(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+// floating-point-type fetch_add(floating-point-type, memory_order = memory_order::seq_cst) const noexcept;
+// T* fetch_add(difference_type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
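+// has_fetch_add checks that both const-qualified fetch_add overloads (with and without an explicit
+// memory_order argument) are available on the type.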
+template <typename T>
+concept has_fetch_add = requires {
+ std::declval<T const>().fetch_add(std::declval<T>());
+ std::declval<T const>().fetch_add(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchAdd {
+ void operator()() const { static_assert(!has_fetch_add<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchAdd {
+ void operator()() const {
+ if constexpr (std::is_arithmetic_v<T>) {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_add(T(2));
+ assert(y == T(1));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_add(T(0)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_add(T(4), std::memory_order_relaxed);
+ assert(y == T(3));
+ assert(x == T(7));
+ ASSERT_NOEXCEPT(a.fetch_add(T(0), std::memory_order_relaxed));
+ }
+ } else if constexpr (std::is_pointer_v<T>) {
+ using U = std::remove_pointer_t<T>;
+ U t[9] = {};
+ T p{&t[1]};
+ std::atomic_ref<T> const a(p);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_add(2);
+ assert(y == &t[1]);
+ assert(a == &t[3]);
+ ASSERT_NOEXCEPT(a.fetch_add(0));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_add(4, std::memory_order_relaxed);
+ assert(y == &t[3]);
+ assert(a == &t[7]);
+ ASSERT_NOEXCEPT(a.fetch_add(0, std::memory_order_relaxed));
+ }
+ } else {
+ static_assert(std::is_void_v<T>);
+ }
+
+ // memory_order::release
+ {
+ auto fetch_add = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ x.fetch_add(new_val - old_val, std::memory_order::release);
+ };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+ test_acquire_release<T>(fetch_add, load);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto fetch_add_no_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) { x.fetch_add(new_val - old_val); };
+ auto fetch_add_with_order = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ x.fetch_add(new_val - old_val, std::memory_order::seq_cst);
+ };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+ test_seq_cst<T>(fetch_add_no_arg, load);
+ test_seq_cst<T>(fetch_add_with_order, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchAdd>()();
+
+ TestFetchAdd<float>()();
+ TestFetchAdd<double>()();
+
+ TestEachPointerType<TestFetchAdd>()();
+
+ TestDoesNotHaveFetchAdd<bool>()();
+ TestDoesNotHaveFetchAdd<UserAtomicType>()();
+ TestDoesNotHaveFetchAdd<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_and.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_and.pass.cpp
new file mode 100644
index 000000000000..8f0bec21fe72
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_and.pass.cpp
@@ -0,0 +1,69 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_and(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_and = requires {
+ std::declval<T const>().fetch_and(std::declval<T>());
+ std::declval<T const>().fetch_and(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchAnd {
+ void operator()() const { static_assert(!has_fetch_and<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchAnd {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_and(T(2));
+ assert(y == T(1));
+ assert(x == T(0));
+ ASSERT_NOEXCEPT(a.fetch_and(T(0)));
+ }
+
+ x = T(1);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_and(T(2), std::memory_order_relaxed);
+ assert(y == T(1));
+ assert(x == T(0));
+ ASSERT_NOEXCEPT(a.fetch_and(T(0), std::memory_order_relaxed));
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchAnd>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveFetchAnd>()();
+
+ TestEachPointerType<TestDoesNotHaveFetchAnd>()();
+
+ TestDoesNotHaveFetchAnd<bool>()();
+ TestDoesNotHaveFetchAnd<UserAtomicType>()();
+ TestDoesNotHaveFetchAnd<LargeUserAtomicType>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_or.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_or.pass.cpp
new file mode 100644
index 000000000000..2045868fde42
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_or.pass.cpp
@@ -0,0 +1,68 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_or(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <concepts>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_or = requires {
+ std::declval<T const>().fetch_or(std::declval<T>());
+ std::declval<T const>().fetch_or(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchOr {
+ void operator()() const { static_assert(!has_fetch_or<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchOr {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_or(T(2));
+ assert(y == T(1));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_or(T(0)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_or(T(2), std::memory_order_relaxed);
+ assert(y == T(3));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_or(T(0), std::memory_order_relaxed));
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchOr>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveFetchOr>()();
+
+ TestEachPointerType<TestDoesNotHaveFetchOr>()();
+
+ TestDoesNotHaveFetchOr<bool>()();
+ TestDoesNotHaveFetchOr<UserAtomicType>()();
+ TestDoesNotHaveFetchOr<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_sub.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_sub.pass.cpp
new file mode 100644
index 000000000000..545604530ada
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_sub.pass.cpp
@@ -0,0 +1,113 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_sub(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+// floating-point-type fetch_sub(floating-point-type, memory_order = memory_order::seq_cst) const noexcept;
+// T* fetch_sub(difference_type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_sub = requires {
+ std::declval<T const>().fetch_sub(std::declval<T>());
+ std::declval<T const>().fetch_sub(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchSub {
+ void operator()() const { static_assert(!has_fetch_sub<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchSub {
+ void operator()() const {
+ if constexpr (std::is_arithmetic_v<T>) {
+ T x(T(7));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_sub(T(4));
+ assert(y == T(7));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_sub(T(0)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_sub(T(2), std::memory_order_relaxed);
+ assert(y == T(3));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a.fetch_sub(T(0), std::memory_order_relaxed));
+ }
+ } else if constexpr (std::is_pointer_v<T>) {
+ using U = std::remove_pointer_t<T>;
+ U t[9] = {};
+ T p{&t[7]};
+ std::atomic_ref<T> const a(p);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_sub(4);
+ assert(y == &t[7]);
+ assert(a == &t[3]);
+ ASSERT_NOEXCEPT(a.fetch_sub(0));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_sub(2, std::memory_order_relaxed);
+ assert(y == &t[3]);
+ assert(a == &t[1]);
+ ASSERT_NOEXCEPT(a.fetch_sub(0, std::memory_order_relaxed));
+ }
+ } else {
+ static_assert(std::is_void_v<T>);
+ }
+
+ // memory_order::release
+ {
+ auto fetch_sub = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ x.fetch_sub(old_val - new_val, std::memory_order::release);
+ };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(std::memory_order::acquire); };
+ test_acquire_release<T>(fetch_sub, load);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto fetch_sub_no_arg = [](std::atomic_ref<T> const& x, T old_val, T new_val) { x.fetch_sub(old_val - new_val); };
+ auto fetch_sub_with_order = [](std::atomic_ref<T> const& x, T old_val, T new_val) {
+ x.fetch_sub(old_val - new_val, std::memory_order::seq_cst);
+ };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+ test_seq_cst<T>(fetch_sub_no_arg, load);
+ test_seq_cst<T>(fetch_sub_with_order, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchSub>()();
+
+ TestFetchSub<float>()();
+ TestFetchSub<double>()();
+
+ TestEachPointerType<TestFetchSub>()();
+
+ TestDoesNotHaveFetchSub<bool>()();
+ TestDoesNotHaveFetchSub<UserAtomicType>()();
+ TestDoesNotHaveFetchSub<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/fetch_xor.pass.cpp b/libcxx/test/std/atomics/atomics.ref/fetch_xor.pass.cpp
new file mode 100644
index 000000000000..aade87f961f1
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/fetch_xor.pass.cpp
@@ -0,0 +1,68 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type fetch_xor(integral-type, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <concepts>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_fetch_xor = requires {
+ std::declval<T const>().fetch_xor(std::declval<T>());
+ std::declval<T const>().fetch_xor(std::declval<T>(), std::declval<std::memory_order>());
+};
+
+template <typename T>
+struct TestDoesNotHaveFetchXor {
+ void operator()() const { static_assert(!has_fetch_xor<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestFetchXor {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_xor(T(2));
+ assert(y == T(1));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.fetch_xor(T(0)));
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.fetch_xor(T(2), std::memory_order_relaxed);
+ assert(y == T(3));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a.fetch_xor(T(0), std::memory_order_relaxed));
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestFetchXor>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveFetchXor>()();
+
+ TestEachPointerType<TestDoesNotHaveFetchXor>()();
+
+ TestDoesNotHaveFetchXor<bool>()();
+ TestDoesNotHaveFetchXor<UserAtomicType>()();
+ TestDoesNotHaveFetchXor<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp b/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp
new file mode 100644
index 000000000000..c84c89b4d2b4
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/increment_decrement.pass.cpp
@@ -0,0 +1,97 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator++(int) const noexcept;
+// integral-type operator--(int) const noexcept;
+// integral-type operator++() const noexcept;
+// integral-type operator--() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_pre_increment_operator = requires { ++std::declval<T const>(); };
+
+template <typename T>
+concept has_post_increment_operator = requires { std::declval<T const>()++; };
+
+template <typename T>
+concept has_pre_decrement_operator = requires { --std::declval<T const>(); };
+
+template <typename T>
+concept has_post_decrement_operator = requires { std::declval<T const>()--; };
+
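+// True only when none of the four increment/decrement operators is usable on a const T;
+// atomic_ref's operators are const-qualified, which is why the concepts above test T const.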
+template <typename T>
+constexpr bool does_not_have_increment_nor_decrement_operators() {
+ return !has_pre_increment_operator<T> && !has_pre_decrement_operator<T> && !has_post_increment_operator<T> &&
+ !has_post_decrement_operator<T>;
+}
+
+template <typename T>
+struct TestDoesNotHaveIncrementDecrement {
+ void operator()() const { static_assert(does_not_have_increment_nor_decrement_operators<T>()); }
+};
+
+template <typename T>
+struct TestIncrementDecrement {
+ void operator()() const {
+ static_assert(std::is_integral_v<T>);
+
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = ++a;
+ assert(y == T(2));
+ assert(x == T(2));
+ ASSERT_NOEXCEPT(++a);
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = --a;
+ assert(y == T(1));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(--a);
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a++;
+ assert(y == T(1));
+ assert(x == T(2));
+ ASSERT_NOEXCEPT(a++);
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a--;
+ assert(y == T(2));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a--);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestIncrementDecrement>()();
+
+ TestEachFloatingPointType<TestDoesNotHaveIncrementDecrement>()();
+
+ TestEachPointerType<TestDoesNotHaveIncrementDecrement>()();
+
+ TestDoesNotHaveIncrementDecrement<bool>()();
+ TestDoesNotHaveIncrementDecrement<UserAtomicType>()();
+ TestDoesNotHaveIncrementDecrement<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp b/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
new file mode 100644
index 000000000000..94f65e3b4b66
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/is_always_lock_free.pass.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <atomic>
+
+// static constexpr bool is_always_lock_free;
+// bool is_lock_free() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+
+#include "test_macros.h"
+
+template <typename T>
+void check_always_lock_free(std::atomic_ref<T> const a) {
+ std::same_as<const bool> decltype(auto) is_always_lock_free = std::atomic_ref<T>::is_always_lock_free;
+ if (is_always_lock_free) {
+ std::same_as<bool> decltype(auto) is_lock_free = a.is_lock_free();
+ assert(is_lock_free);
+ }
+ ASSERT_NOEXCEPT(a.is_lock_free());
+}
+
+#define CHECK_ALWAYS_LOCK_FREE(T) \
+ do { \
+ typedef T type; \
+ type obj{}; \
+ check_always_lock_free(std::atomic_ref<type>(obj)); \
+ } while (0)
+
+void test() {
+ int i = 0;
+ check_always_lock_free(std::atomic_ref<int>(i));
+
+ float f = 0.f;
+ check_always_lock_free(std::atomic_ref<float>(f));
+
+ int* p = &i;
+ check_always_lock_free(std::atomic_ref<int*>(p));
+
+ CHECK_ALWAYS_LOCK_FREE(struct Empty{});
+ CHECK_ALWAYS_LOCK_FREE(struct OneInt { int i; });
+ CHECK_ALWAYS_LOCK_FREE(struct IntArr2 { int i[2]; });
+ CHECK_ALWAYS_LOCK_FREE(struct FloatArr3 { float i[3]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr2 { long long int i[2]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr4 { long long int i[4]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr8 { long long int i[8]; });
+ CHECK_ALWAYS_LOCK_FREE(struct LLIArr16 { long long int i[16]; });
+ CHECK_ALWAYS_LOCK_FREE(struct Padding {
+ char c; /* padding */
+ long long int i;
+ });
+ CHECK_ALWAYS_LOCK_FREE(union IntFloat {
+ int i;
+ float f;
+ });
+ CHECK_ALWAYS_LOCK_FREE(enum class CharEnumClass : char{foo});
+}
+
+int main(int, char**) {
+ test();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/load.pass.cpp b/libcxx/test/std/atomics/atomics.ref/load.pass.cpp
new file mode 100644
index 000000000000..feed0fbaed84
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/load.pass.cpp
@@ -0,0 +1,62 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// T load(memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <concepts>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestLoad {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ {
+ std::same_as<T> decltype(auto) y = a.load();
+ assert(y == T(1));
+ ASSERT_NOEXCEPT(a.load());
+ }
+
+ {
+ std::same_as<T> decltype(auto) y = a.load(std::memory_order_seq_cst);
+ assert(y == T(1));
+ ASSERT_NOEXCEPT(a.load(std::memory_order_seq_cst));
+ }
+
+ // memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val); };
+ auto load_no_arg = [](std::atomic_ref<T> const& y) { return y.load(); };
+ auto load_with_order = [](std::atomic_ref<T> const& y) { return y.load(std::memory_order::seq_cst); };
+ test_seq_cst<T>(store, load_no_arg);
+ test_seq_cst<T>(store, load_with_order);
+ }
+
+ // memory_order::release
+ {
+ auto store = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& y) { return y.load(std::memory_order::acquire); };
+ test_acquire_release<T>(store, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestLoad>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/member_types.pass.cpp b/libcxx/test/std/atomics/atomics.ref/member_types.pass.cpp
new file mode 100644
index 000000000000..d4e2f0126d62
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/member_types.pass.cpp
@@ -0,0 +1,132 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <atomic>
+
+// template <class T>
+// struct atomic_ref
+// {
+// using value_type = T;
+// using difference_type = value_type; // only for atomic_ref<Integral> and
+// // atomic_ref<Floating> specializations
+// using difference_type = std::ptrdiff_t; // only for atomic_ref<T*> specializations
+//
+// explicit atomic_ref(T&);
+// atomic_ref(const atomic_ref&) noexcept;
+// atomic_ref& operator=(const atomic_ref&) = delete;
+// };
+
+#include <atomic>
+#include <type_traits>
+
+#include "test_macros.h"
+
+template <class T>
+concept has_difference_type = requires { typename T::difference_type; };
+
+template <class T>
+void check_member_types() {
+ if constexpr ((std::is_integral_v<T> && !std::is_same_v<T, bool>) || std::is_floating_point_v<T>) {
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::value_type, T);
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::difference_type, T);
+ } else if constexpr (std::is_pointer_v<T>) {
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::value_type, T);
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::difference_type, std::ptrdiff_t);
+ } else {
+ ASSERT_SAME_TYPE(typename std::atomic_ref<T>::value_type, T);
+ static_assert(!has_difference_type<std::atomic_ref<T>>);
+ }
+}
+
+template <class T>
+void test() {
+ // value_type and difference_type (except for primary template)
+ check_member_types<T>();
+
+ static_assert(std::is_nothrow_copy_constructible_v<std::atomic_ref<T>>);
+
+ static_assert(!std::is_copy_assignable_v<std::atomic_ref<T>>);
+
+ // explicit constructor
+ static_assert(!std::is_convertible_v<T, std::atomic_ref<T>>);
+ static_assert(std::is_constructible_v<std::atomic_ref<T>, T&>);
+}
+
+void testall() {
+ // Primary template
+ struct Empty {};
+ test<Empty>();
+ struct Trivial {
+ int a;
+ float b;
+ };
+ test<Trivial>();
+ test<bool>();
+
+ // Partial specialization for pointer types
+ test<void*>();
+
+ // Specialization for integral types
+ // + character types
+ test<char>();
+ test<char8_t>();
+ test<char16_t>();
+ test<char32_t>();
+ test<wchar_t>();
+ // + standard signed integer types
+ test<signed char>();
+ test<short>();
+ test<int>();
+ test<long>();
+ test<long long>();
+ // + standard unsigned integer types
+ test<unsigned char>();
+ test<unsigned short>();
+ test<unsigned int>();
+ test<unsigned long>();
+ test<unsigned long long>();
+ // + any other types needed by the typedefs in the header <cstdint>
+ test<int8_t>();
+ test<int16_t>();
+ test<int32_t>();
+ test<int64_t>();
+ test<int_fast8_t>();
+ test<int_fast16_t>();
+ test<int_fast32_t>();
+ test<int_fast64_t>();
+ test<int_least8_t>();
+ test<int_least16_t>();
+ test<int_least32_t>();
+ test<int_least64_t>();
+ test<intmax_t>();
+ test<intptr_t>();
+ test<uint8_t>();
+ test<uint16_t>();
+ test<uint32_t>();
+ test<uint64_t>();
+ test<uint_fast8_t>();
+ test<uint_fast16_t>();
+ test<uint_fast32_t>();
+ test<uint_fast64_t>();
+ test<uint_least8_t>();
+ test<uint_least16_t>();
+ test<uint_least32_t>();
+ test<uint_least64_t>();
+ test<uintmax_t>();
+ test<uintptr_t>();
+
+ // Specialization for floating-point types
+ // + floating-point types
+ test<float>();
+ test<double>();
+ test<long double>();
+ // + TODO extended floating-point types
+}
+
+int main(int, char**) { return 0; }
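
As a quick illustration of the member typedefs checked above: difference_type follows the specialization (value_type for integral and floating-point types, std::ptrdiff_t for pointers) and is absent from the primary template. A small sketch, assuming C++20:

    #include <atomic>
    #include <cstddef>
    #include <type_traits>

    static_assert(std::is_same_v<std::atomic_ref<int>::value_type, int>);
    static_assert(std::is_same_v<std::atomic_ref<int>::difference_type, int>);          // integral
    static_assert(std::is_same_v<std::atomic_ref<double>::difference_type, double>);    // floating-point
    static_assert(std::is_same_v<std::atomic_ref<int*>::difference_type, std::ptrdiff_t>); // pointer
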
diff --git a/libcxx/test/std/atomics/atomics.ref/notify_all.pass.cpp b/libcxx/test/std/atomics/atomics.ref/notify_all.pass.cpp
new file mode 100644
index 000000000000..382b19f8c1d7
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/notify_all.pass.cpp
@@ -0,0 +1,78 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-threads
+// XFAIL: availability-synchronization_library-missing
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// void notify_all() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+#include "atomic_helpers.h"
+#include "make_test_thread.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestNotifyAll {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ bool done = false;
+ std::atomic<int> started_num = 0;
+ std::atomic<int> wait_done_num = 0;
+
+ constexpr auto number_of_threads = 8;
+ std::vector<std::thread> threads;
+ threads.reserve(number_of_threads);
+
+ for (auto j = 0; j < number_of_threads; ++j) {
+ threads.push_back(support::make_test_thread([&a, &started_num, &done, &wait_done_num] {
+ started_num.fetch_add(1, std::memory_order::relaxed);
+
+ a.wait(T(1));
+ wait_done_num.fetch_add(1, std::memory_order::relaxed);
+
+ // likely to fail if wait did not block
+ assert(done);
+ }));
+ }
+
+ while (started_num.load(std::memory_order::relaxed) != number_of_threads) {
+ std::this_thread::yield();
+ }
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+
+ done = true;
+ a.store(T(3));
+ a.notify_all();
+
+    // notify_all should unblock all the threads so that the loop below won't get stuck
+ while (wait_done_num.load(std::memory_order::relaxed) != number_of_threads) {
+ std::this_thread::yield();
+ }
+
+ for (auto& thread : threads) {
+ thread.join();
+ }
+
+ ASSERT_NOEXCEPT(a.notify_all());
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestNotifyAll>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/notify_one.pass.cpp b/libcxx/test/std/atomics/atomics.ref/notify_one.pass.cpp
new file mode 100644
index 000000000000..611e67417e4d
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/notify_one.pass.cpp
@@ -0,0 +1,46 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-threads
+// XFAIL: availability-synchronization_library-missing
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// void notify_one() const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <thread>
+#include <type_traits>
+#include <vector>
+
+#include "atomic_helpers.h"
+#include "make_test_thread.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestNotifyOne {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::thread t = support::make_test_thread([&]() {
+ a.store(T(3));
+ a.notify_one();
+ });
+ a.wait(T(1));
+ assert(a.load() == T(3));
+ t.join();
+ ASSERT_NOEXCEPT(a.notify_one());
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestNotifyOne>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/operator_minus_equals.pass.cpp b/libcxx/test/std/atomics/atomics.ref/operator_minus_equals.pass.cpp
new file mode 100644
index 000000000000..571d626035fa
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/operator_minus_equals.pass.cpp
@@ -0,0 +1,79 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator-=(integral-type) const noexcept;
+// floating-point-type operator-=(floating-point-type) const noexcept;
+// T* operator-=(difference_type) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_operator_minus_equals = requires { std::declval<T const>() -= std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveOperatorMinusEquals {
+ void operator()() const { static_assert(!has_operator_minus_equals<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestOperatorMinusEquals {
+ void operator()() const {
+ if constexpr (std::is_arithmetic_v<T>) {
+ T x(T(3));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a -= T(2));
+ assert(y == T(1));
+ assert(x == T(1));
+ ASSERT_NOEXCEPT(a -= T(0));
+ } else if constexpr (std::is_pointer_v<T>) {
+ using U = std::remove_pointer_t<T>;
+ U t[9] = {};
+ T p{&t[3]};
+ std::atomic_ref<T> const a(p);
+
+ std::same_as<T> decltype(auto) y = (a -= 2);
+ assert(y == &t[1]);
+ assert(a == &t[1]);
+ ASSERT_NOEXCEPT(a -= 0);
+ } else {
+ static_assert(std::is_void_v<T>);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto minus_equals = [](std::atomic_ref<T> const& x, T old_val, T new_val) { x -= (old_val - new_val); };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+ test_seq_cst<T>(minus_equals, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestOperatorMinusEquals>()();
+
+ TestOperatorMinusEquals<float>()();
+ TestOperatorMinusEquals<double>()();
+
+ TestEachPointerType<TestOperatorMinusEquals>()();
+
+ TestDoesNotHaveOperatorMinusEquals<bool>()();
+ TestDoesNotHaveOperatorMinusEquals<UserAtomicType>()();
+ TestDoesNotHaveOperatorMinusEquals<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/operator_plus_equals.pass.cpp b/libcxx/test/std/atomics/atomics.ref/operator_plus_equals.pass.cpp
new file mode 100644
index 000000000000..de48ea56f57f
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/operator_plus_equals.pass.cpp
@@ -0,0 +1,79 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+
+// integral-type operator+=(integral-type) const noexcept;
+// floating-point-type operator+=(floating-point-type) const noexcept;
+// T* operator+=(difference_type) const noexcept;
+
+#include <atomic>
+#include <concepts>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+concept has_operator_plus_equals = requires { std::declval<T const>() += std::declval<T>(); };
+
+template <typename T>
+struct TestDoesNotHaveOperatorPlusEquals {
+ void operator()() const { static_assert(!has_operator_plus_equals<std::atomic_ref<T>>); }
+};
+
+template <typename T>
+struct TestOperatorPlusEquals {
+ void operator()() const {
+ if constexpr (std::is_arithmetic_v<T>) {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ std::same_as<T> decltype(auto) y = (a += T(2));
+ assert(y == T(3));
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a += T(0));
+ } else if constexpr (std::is_pointer_v<T>) {
+ using U = std::remove_pointer_t<T>;
+ U t[9] = {};
+ T p{&t[1]};
+ std::atomic_ref<T> const a(p);
+
+ std::same_as<T> decltype(auto) y = (a += 2);
+ assert(y == &t[3]);
+ assert(a == &t[3]);
+ ASSERT_NOEXCEPT(a += 0);
+ } else {
+ static_assert(std::is_void_v<T>);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto plus_equals = [](std::atomic_ref<T> const& x, T old_val, T new_val) { x += (new_val - old_val); };
+ auto load = [](std::atomic_ref<T> const& x) { return x.load(); };
+ test_seq_cst<T>(plus_equals, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachIntegralType<TestOperatorPlusEquals>()();
+
+ TestOperatorPlusEquals<float>()();
+ TestOperatorPlusEquals<double>()();
+
+ TestEachPointerType<TestOperatorPlusEquals>()();
+
+ TestDoesNotHaveOperatorPlusEquals<bool>()();
+ TestDoesNotHaveOperatorPlusEquals<UserAtomicType>()();
+ TestDoesNotHaveOperatorPlusEquals<LargeUserAtomicType>()();
+
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
new file mode 100644
index 000000000000..86e0cba4dbf0
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/required_alignment.pass.cpp
@@ -0,0 +1,39 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// static constexpr size_t required_alignment;
+
+#include <atomic>
+#include <cassert>
+#include <concepts>
+
+template <typename T>
+constexpr void check_required_alignment() {
+ std::same_as<const std::size_t> decltype(auto) required_alignment = std::atomic_ref<T>::required_alignment;
+ assert(required_alignment >= alignof(T));
+}
+
+constexpr bool test() {
+ check_required_alignment<int>();
+ check_required_alignment<float>();
+ check_required_alignment<int*>();
+ struct Empty {};
+ check_required_alignment<Empty>();
+ struct Trivial {
+ int a;
+ };
+ check_required_alignment<Trivial>();
+ return true;
+}
+
+int main(int, char**) {
+ test();
+ static_assert(test());
+ return 0;
+}
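
required_alignment may be stricter than alignof(T), so callers are expected to over-align the referenced object. A hedged usage sketch (not taken from the patch):

    #include <atomic>

    // The referenced object must satisfy atomic_ref<T>::required_alignment,
    // which may be larger than alignof(T); alignas makes that explicit.
    alignas(std::atomic_ref<int>::required_alignment) int counter = 0;

    void bump() { std::atomic_ref<int>(counter).fetch_add(1, std::memory_order_relaxed); }
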
diff --git a/libcxx/test/std/atomics/atomics.ref/requires-trivially-copyable.verify.cpp b/libcxx/test/std/atomics/atomics.ref/requires-trivially-copyable.verify.cpp
new file mode 100644
index 000000000000..9a8b036ffd1f
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/requires-trivially-copyable.verify.cpp
@@ -0,0 +1,26 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <atomic>
+
+// template<class T>
+// class atomic_ref;
+
+// The program is ill-formed if is_trivially_copyable_v<T> is false.
+
+#include <atomic>
+
+void trivially_copyable() {
+ struct X {
+ X() = default;
+ X(X const&) {} // -> not trivially copyable
+ } x;
+ // expected-error-re@*:* {{static assertion failed {{.*}}atomic_ref<T> requires that 'T' be a trivially copyable type}}
+ std::atomic_ref<X> r(x);
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/store.pass.cpp b/libcxx/test/std/atomics/atomics.ref/store.pass.cpp
new file mode 100644
index 000000000000..ea01a3d02a34
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/store.pass.cpp
@@ -0,0 +1,61 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// void store(T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestStore {
+ void operator()() const {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ a.store(T(2));
+ assert(x == T(2));
+ ASSERT_NOEXCEPT(a.store(T(1)));
+
+ a.store(T(3), std::memory_order_seq_cst);
+ assert(x == T(3));
+ ASSERT_NOEXCEPT(a.store(T(0), std::memory_order_seq_cst));
+
+ // TODO memory_order::relaxed
+
+ // memory_order::seq_cst
+ {
+ auto store_no_arg = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val); };
+ auto store_with_order = [](std::atomic_ref<T> const& y, T, T new_val) {
+ y.store(new_val, std::memory_order::seq_cst);
+ };
+ auto load = [](std::atomic_ref<T> const& y) { return y.load(); };
+ test_seq_cst<T>(store_no_arg, load);
+ test_seq_cst<T>(store_with_order, load);
+ }
+
+ // memory_order::release
+ {
+ auto store = [](std::atomic_ref<T> const& y, T, T new_val) { y.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& y) { return y.load(std::memory_order::acquire); };
+ test_acquire_release<T>(store, load);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestStore>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.ref/test_helper.h b/libcxx/test/std/atomics/atomics.ref/test_helper.h
new file mode 100644
index 000000000000..225a70c5a16c
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/test_helper.h
@@ -0,0 +1,136 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_ATOMICS_ATOMIC_REF_TEST_HELPER_H
+#define TEST_STD_ATOMICS_ATOMIC_REF_TEST_HELPER_H
+
+#include <atomic>
+#include <cassert>
+#include <cmath>
+#include <vector>
+
+#include "test_macros.h"
+
+#ifndef TEST_HAS_NO_THREADS
+# include "make_test_thread.h"
+# include <thread>
+#endif
+
+template <class T>
+bool equals(T x, T y) {
+ return x == y;
+}
+
+template <class T>
+T make_value(int i) {
+ assert(i == 0 || i == 1);
+ if constexpr (std::is_pointer_v<T>) {
+ // So that pointers returned can be subtracted from one another
+ static std::remove_const_t<std::remove_pointer_t<T>> d[2];
+ return &d[i];
+ } else {
+ return T(i);
+ }
+}
+
+// Test that all threads see the exact same sequence of events.
+// The test will pass reliably if store_op and load_op access the
+// memory with seq_cst ordering.
+template <class T, class StoreOp, class LoadOp>
+void test_seq_cst(StoreOp store_op, LoadOp load_op) {
+#ifndef TEST_HAS_NO_THREADS
+ for (int i = 0; i < 100; ++i) {
+ T old_value(make_value<T>(0));
+ T new_value(make_value<T>(1));
+
+ T copy_x = old_value;
+ std::atomic_ref<T> const x(copy_x);
+ T copy_y = old_value;
+ std::atomic_ref<T> const y(copy_y);
+
+ std::atomic_bool x_updated_first(false);
+ std::atomic_bool y_updated_first(false);
+
+ auto t1 = support::make_test_thread([&] { store_op(x, old_value, new_value); });
+
+ auto t2 = support::make_test_thread([&] { store_op(y, old_value, new_value); });
+
+ auto t3 = support::make_test_thread([&] {
+ while (!equals(load_op(x), new_value)) {
+ std::this_thread::yield();
+ }
+ if (!equals(load_op(y), new_value)) {
+ x_updated_first.store(true, std::memory_order_relaxed);
+ }
+ });
+
+ auto t4 = support::make_test_thread([&] {
+ while (!equals(load_op(y), new_value)) {
+ std::this_thread::yield();
+ }
+ if (!equals(load_op(x), new_value)) {
+ y_updated_first.store(true, std::memory_order_relaxed);
+ }
+ });
+
+ t1.join();
+ t2.join();
+ t3.join();
+ t4.join();
+ // thread 3 and thread 4 cannot see different orders of storing x and y
+ assert(!(x_updated_first && y_updated_first));
+ }
+#else
+ (void)store_op;
+ (void)load_op;
+#endif
+}
+
+// Test that all writes before the store are seen by other threads after the load.
+// The test will pass reliably if store_op and load_op access the
+// memory with acquire-release ordering.
+template <class T, class StoreOp, class LoadOp>
+void test_acquire_release(StoreOp store_op, LoadOp load_op) {
+#ifndef TEST_HAS_NO_THREADS
+ for (auto i = 0; i < 100; ++i) {
+ T old_value(make_value<T>(0));
+ T new_value(make_value<T>(1));
+
+ T copy = old_value;
+ std::atomic_ref<T> const at(copy);
+ int non_atomic = 5;
+
+ constexpr auto number_of_threads = 8;
+ std::vector<std::thread> threads;
+ threads.reserve(number_of_threads);
+
+ for (auto j = 0; j < number_of_threads; ++j) {
+ threads.push_back(support::make_test_thread([&at, &non_atomic, load_op, new_value] {
+ while (!equals(load_op(at), new_value)) {
+ std::this_thread::yield();
+ }
+ // Other thread's writes before the release store are visible
+ // in this thread's read after the acquire load
+ assert(non_atomic == 6);
+ }));
+ }
+
+ non_atomic = 6;
+ store_op(at, old_value, new_value);
+
+ for (auto& thread : threads) {
+ thread.join();
+ }
+ }
+#else
+ (void)store_op;
+ (void)load_op;
+#endif
+}
+
+#endif // TEST_STD_ATOMICS_ATOMIC_REF_TEST_HELPER_H
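
test_seq_cst above is essentially an IRIW-style litmus test: two writers update x and y, two readers poll them in opposite orders, and under seq_cst a single total store order exists, so the readers cannot disagree. A self-contained sketch of the same idea using plain std::atomic (an illustration, not the helper itself):

    #include <atomic>
    #include <cassert>
    #include <thread>

    int main() {
      std::atomic<int> x{0}, y{0};
      std::atomic<bool> x_first{false}, y_first{false};
      std::thread w1([&] { x.store(1); });   // seq_cst stores by default
      std::thread w2([&] { y.store(1); });
      std::thread r1([&] { while (x.load() != 1) {} if (y.load() != 1) x_first = true; });
      std::thread r2([&] { while (y.load() != 1) {} if (x.load() != 1) y_first = true; });
      w1.join(); w2.join(); r1.join(); r2.join();
      assert(!(x_first && y_first)); // both readers must agree on one store order under seq_cst
      return 0;
    }
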
diff --git a/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
new file mode 100644
index 000000000000..e5310febf5c5
--- /dev/null
+++ b/libcxx/test/std/atomics/atomics.ref/wait.pass.cpp
@@ -0,0 +1,88 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-threads
+// XFAIL: availability-synchronization_library-missing
+// XFAIL: !has-64-bit-atomics
+// XFAIL: !has-1024-bit-atomics
+
+// void wait(T, memory_order = memory_order::seq_cst) const noexcept;
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+#include "atomic_helpers.h"
+#include "make_test_thread.h"
+#include "test_helper.h"
+#include "test_macros.h"
+
+template <typename T>
+struct TestWait {
+ void operator()() const {
+ {
+ T x(T(1));
+ std::atomic_ref<T> const a(x);
+
+ assert(a.load() == T(1));
+ a.wait(T(0));
+ std::thread t1 = support::make_test_thread([&]() {
+ a.store(T(3));
+ a.notify_one();
+ });
+ a.wait(T(1));
+ assert(a.load() == T(3));
+ t1.join();
+ ASSERT_NOEXCEPT(a.wait(T(0)));
+
+ assert(a.load() == T(3));
+ a.wait(T(0), std::memory_order_seq_cst);
+ std::thread t2 = support::make_test_thread([&]() {
+ a.store(T(5));
+ a.notify_one();
+ });
+ a.wait(T(3), std::memory_order_seq_cst);
+ assert(a.load() == T(5));
+ t2.join();
+ ASSERT_NOEXCEPT(a.wait(T(0), std::memory_order_seq_cst));
+ }
+
+ // memory_order::acquire
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val, std::memory_order::release); };
+ auto load = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ x.wait(T(255), std::memory_order::acquire);
+ return result;
+ };
+ test_acquire_release<T>(store, load);
+ }
+
+ // memory_order::seq_cst
+ {
+ auto store = [](std::atomic_ref<T> const& x, T, T new_val) { x.store(new_val); };
+ auto load_no_arg = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ x.wait(T(255));
+ return result;
+ };
+ auto load_with_order = [](std::atomic_ref<T> const& x) {
+ auto result = x.load(std::memory_order::relaxed);
+ x.wait(T(255), std::memory_order::seq_cst);
+ return result;
+ };
+ test_seq_cst<T>(store, load_no_arg);
+ test_seq_cst<T>(store, load_with_order);
+ }
+ }
+};
+
+int main(int, char**) {
+ TestEachAtomicType<TestWait>()();
+ return 0;
+}
diff --git a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp
index 1a4e6dfe0b31..b38123628fe0 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/atomics.types.float/types.compile.pass.cpp
@@ -17,8 +17,11 @@
template <class T>
void test() {
+ // LWG 3045. atomic<floating-point> doesn't have value_type or difference_type
+ // https://cplusplus.github.io/LWG/issue3045
static_assert(std::is_same_v<typename std::atomic<T>::value_type, T>);
static_assert(std::is_same_v<typename std::atomic<T>::difference_type, T>);
+
static_assert(std::is_standard_layout_v<std::atomic<T>>);
static_assert(std::is_trivially_destructible_v<std::atomic<T>>);
}
diff --git a/libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp b/libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp
index 1d6069933eea..52111ddb3f27 100644
--- a/libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.value_compare/types.pass.cpp
@@ -10,7 +10,7 @@
// class value_compare
-// REQUIRES: c++98 || c++03 || c++11 || c++14
+// REQUIRES: c++03 || c++11 || c++14
#include <map>
#include <string>
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp
index 6ecaf9247ebe..0d0c74f29f42 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.value_compare/types.pass.cpp
@@ -10,7 +10,7 @@
// class value_compare
-// REQUIRES: c++98 || c++03 || c++11 || c++14
+// REQUIRES: c++03 || c++11 || c++14
#include <map>
#include <string>
diff --git a/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp b/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp
new file mode 100644
index 000000000000..8fcc811f6df3
--- /dev/null
+++ b/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp
@@ -0,0 +1,173 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// FIXME: Fatal error with the following targets (remove XFAIL when fixed):
+// Pass-by-value arguments with alignment greater than register width are not supported.
+// XFAIL: target=powerpc{{.*}}-ibm-aix7.2.5.7
+
+// <experimental/simd>
+//
+// [simd.class]
+// template<class U, class Flags> void copy_from(const U* mem, Flags);
+// template<class U, class Flags> void copy_to(U* mem, Flags) const;
+
+#include "../test_utils.h"
+
+namespace ex = std::experimental::parallelism_v2;
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct ElementAlignedCopyFromHelper {
+ template <class U>
+ void operator()() const {
+ U buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ buffer[i] = static_cast<U>(i);
+ ex::simd<T, SimdAbi> origin_simd;
+ origin_simd.copy_from(buffer, ex::element_aligned_tag());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct VectorAlignedCopyFromHelper {
+ template <class U>
+ void operator()() const {
+ alignas(ex::memory_alignment_v<ex::simd<T, SimdAbi>, U>) U buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ buffer[i] = static_cast<U>(i);
+ ex::simd<T, SimdAbi> origin_simd;
+ origin_simd.copy_from(buffer, ex::vector_aligned_tag());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct OveralignedCopyFromHelper {
+ template <class U>
+ void operator()() const {
+ alignas(bit_ceil(sizeof(U) + 1)) U buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ buffer[i] = static_cast<U>(i);
+ ex::simd<T, SimdAbi> origin_simd;
+ origin_simd.copy_from(buffer, ex::overaligned_tag<bit_ceil(sizeof(U) + 1)>());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, std::size_t>
+struct CheckSimdCopyFrom {
+ template <class SimdAbi>
+ void operator()() {
+ constexpr std::size_t array_size = ex::simd_size_v<T, SimdAbi>;
+
+ types::for_each(simd_test_types(), ElementAlignedCopyFromHelper<T, SimdAbi, array_size>());
+ types::for_each(simd_test_types(), VectorAlignedCopyFromHelper<T, SimdAbi, array_size>());
+ types::for_each(simd_test_types(), OveralignedCopyFromHelper<T, SimdAbi, array_size>());
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct ElementAlignedCopyToHelper {
+ template <class U>
+ void operator()() const {
+ U buffer[array_size];
+ ex::simd<T, SimdAbi> origin_simd([](T i) { return i; });
+ origin_simd.copy_to(buffer, ex::element_aligned_tag());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct VectorAlignedCopyToHelper {
+ template <class U>
+ void operator()() const {
+ alignas(ex::memory_alignment_v<ex::simd<T, SimdAbi>, U>) U buffer[array_size];
+ ex::simd<T, SimdAbi> origin_simd([](T i) { return i; });
+ origin_simd.copy_to(buffer, ex::vector_aligned_tag());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, class SimdAbi, std::size_t array_size>
+struct OveralignedCopyToHelper {
+ template <class U>
+ void operator()() const {
+ alignas(bit_ceil(sizeof(U) + 1)) U buffer[array_size];
+ ex::simd<T, SimdAbi> origin_simd([](T i) { return i; });
+ origin_simd.copy_to(buffer, ex::overaligned_tag<bit_ceil(sizeof(U) + 1)>());
+ assert_simd_values_equal(origin_simd, buffer);
+ }
+};
+
+template <class T, std::size_t>
+struct CheckSimdCopyTo {
+ template <class SimdAbi>
+ void operator()() {
+ constexpr std::size_t array_size = ex::simd_size_v<T, SimdAbi>;
+
+ types::for_each(simd_test_types(), ElementAlignedCopyToHelper<T, SimdAbi, array_size>());
+ types::for_each(simd_test_types(), VectorAlignedCopyToHelper<T, SimdAbi, array_size>());
+ types::for_each(simd_test_types(), OveralignedCopyToHelper<T, SimdAbi, array_size>());
+ }
+};
+
+template <class U, class T, class Flags, class SimdAbi = ex::simd_abi::compatible<T>, class = void>
+struct has_copy_from : std::false_type {};
+
+template <class U, class T, class Flags, class SimdAbi>
+struct has_copy_from<U,
+ T,
+ Flags,
+ SimdAbi,
+ std::void_t<decltype(std::declval<ex::simd<T, SimdAbi>>().copy_from(
+ std::declval<const U*>(), std::declval<Flags>()))>> : std::true_type {};
+
+template <class U, class T, class Flags, class SimdAbi = ex::simd_abi::compatible<T>, class = void>
+struct has_copy_to : std::false_type {};
+
+template <class U, class T, class Flags, class SimdAbi>
+struct has_copy_to<
+ U,
+ T,
+ Flags,
+ SimdAbi,
+ std::void_t<decltype(std::declval<ex::simd<T, SimdAbi>>().copy_to(std::declval<U*>(), std::declval<Flags>()))>>
+ : std::true_type {};
+
+template <class T, std::size_t>
+struct CheckSimdCopyTraits {
+ template <class SimdAbi>
+ void operator()() {
+ // These functions shall not participate in overload resolution unless
+ // is_simd_flag_type_v<Flags> is true, and
+ // U is a vectorizable type.
+ static_assert(has_copy_from<int, T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(has_copy_to<int, T, ex::element_aligned_tag, SimdAbi>::value);
+
+ // is_simd_flag_type_v<Flags> is false
+ static_assert(!has_copy_from<int, T, T, SimdAbi>::value);
+ static_assert(!has_copy_to<int, T, T, SimdAbi>::value);
+ static_assert(!has_copy_from<int, T, SimdAbi, SimdAbi>::value);
+ static_assert(!has_copy_to<int, T, SimdAbi, SimdAbi>::value);
+
+ // U is not a vectorizable type.
+ static_assert(!has_copy_from<SimdAbi, T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(!has_copy_to<SimdAbi, T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(!has_copy_from<ex::element_aligned_tag, T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(!has_copy_to<ex::element_aligned_tag, T, ex::element_aligned_tag, SimdAbi>::value);
+ }
+};
+
+int main(int, char**) {
+ test_all_simd_abi<CheckSimdCopyFrom>();
+ test_all_simd_abi<CheckSimdCopyTo>();
+ test_all_simd_abi<CheckSimdCopyTraits>();
+ return 0;
+}
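
For context on the copy_from/copy_to API exercised above, a minimal round-trip sketch using the Parallelism TS v2 element_aligned tag (names come from <experimental/simd>; the buffers are assumed to hold at least simd::size() elements):

    #include <experimental/simd>
    namespace ex = std::experimental;

    void add_one(const float* in, float* out) {
      ex::native_simd<float> v;
      v.copy_from(in, ex::element_aligned);  // load size() lanes from in
      v += 1.0f;                             // broadcast-add to every lane
      v.copy_to(out, ex::element_aligned);   // store the lanes to out
    }
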
diff --git a/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_copy.pass.cpp b/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_copy.pass.cpp
new file mode 100644
index 000000000000..0c3b4c9ea6d5
--- /dev/null
+++ b/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_copy.pass.cpp
@@ -0,0 +1,127 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14
+
+// <experimental/simd>
+//
+// [simd.class]
+// template<class Flags> void copy_from(const value_type* mem, Flags);
+// template<class Flags> void copy_to(value_type* mem, Flags);
+
+#include "../test_utils.h"
+
+namespace ex = std::experimental::parallelism_v2;
+
+template <class T, std::size_t>
+struct CheckSimdMaskCopyFrom {
+ template <class SimdAbi>
+ void operator()() {
+ constexpr std::size_t array_size = ex::simd_size_v<T, SimdAbi>;
+
+ // element aligned tag
+ constexpr std::size_t element_alignas_size = alignof(bool);
+ alignas(element_alignas_size) bool element_buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ element_buffer[i] = static_cast<bool>(i % 2);
+ ex::simd_mask<T, SimdAbi> element_mask;
+ element_mask.copy_from(element_buffer, ex::element_aligned_tag());
+ assert_simd_mask_values_equal(element_mask, element_buffer);
+
+ // vector aligned tag
+ constexpr std::size_t vector_alignas_size = ex::memory_alignment_v<ex::simd_mask<T, SimdAbi>>;
+ alignas(vector_alignas_size) bool vector_buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ vector_buffer[i] = static_cast<bool>(i % 2);
+ ex::simd_mask<T, SimdAbi> vector_mask;
+ vector_mask.copy_from(vector_buffer, ex::vector_aligned_tag());
+ assert_simd_mask_values_equal(vector_mask, vector_buffer);
+
+ // overaligned tag
+ constexpr std::size_t over_alignas_size = bit_ceil(sizeof(bool) + 1);
+ alignas(over_alignas_size) bool overaligned_buffer[array_size];
+ for (size_t i = 0; i < array_size; ++i)
+ overaligned_buffer[i] = static_cast<bool>(i % 2);
+ ex::simd_mask<T, SimdAbi> overaligned_mask;
+ overaligned_mask.copy_from(overaligned_buffer, ex::overaligned_tag<over_alignas_size>());
+ assert_simd_mask_values_equal(overaligned_mask, overaligned_buffer);
+ }
+};
+
+template <class T, std::size_t>
+struct CheckSimdMaskCopyTo {
+ template <class SimdAbi>
+ void operator()() {
+ constexpr std::size_t array_size = ex::simd_size_v<T, SimdAbi>;
+
+ // element aligned tag
+ constexpr std::size_t element_alignas_size = alignof(bool);
+ alignas(element_alignas_size) bool element_buffer[array_size];
+ ex::simd_mask<T, SimdAbi> element_mask(true);
+ element_mask.copy_to(element_buffer, ex::element_aligned_tag());
+ assert_simd_mask_values_equal(element_mask, element_buffer);
+
+ // vector aligned tag
+ constexpr std::size_t vector_alignas_size = ex::memory_alignment_v<ex::simd_mask<T, SimdAbi>>;
+ alignas(vector_alignas_size) bool vector_buffer[array_size];
+ ex::simd_mask<T, SimdAbi> vector_mask(false);
+ vector_mask.copy_to(vector_buffer, ex::vector_aligned_tag());
+ assert_simd_mask_values_equal(vector_mask, vector_buffer);
+
+ // overaligned tag
+ constexpr std::size_t over_alignas_size = bit_ceil(sizeof(bool) + 1);
+ alignas(over_alignas_size) bool overaligned_buffer[array_size];
+ ex::simd_mask<T, SimdAbi> overaligned_mask(true);
+ overaligned_mask.copy_to(overaligned_buffer, ex::overaligned_tag<over_alignas_size>());
+ assert_simd_mask_values_equal(overaligned_mask, overaligned_buffer);
+ }
+};
+
+template <class T, class Flags, class SimdAbi = ex::simd_abi::compatible<T>, class = void>
+struct has_copy_from : std::false_type {};
+
+template <class T, class Flags, class SimdAbi>
+struct has_copy_from<T,
+ Flags,
+ SimdAbi,
+ std::void_t<decltype(std::declval<ex::simd_mask<T, SimdAbi>>().copy_from(
+ std::declval<const bool*>(), std::declval<Flags>()))>> : std::true_type {};
+
+template <class T, class Flags, class SimdAbi = ex::simd_abi::compatible<T>, class = void>
+struct has_copy_to : std::false_type {};
+
+template <class T, class Flags, class SimdAbi>
+struct has_copy_to<T,
+ Flags,
+ SimdAbi,
+ std::void_t<decltype(std::declval<ex::simd_mask<T, SimdAbi>>().copy_to(
+ std::declval<bool*>(), std::declval<Flags>()))>> : std::true_type {};
+
+template <class T, std::size_t>
+struct CheckSimdMaskCopyTraits {
+ template <class SimdAbi>
+ void operator()() {
+ // These functions shall not participate in overload resolution unless
+ // is_simd_flag_type_v<Flags> is true
+ static_assert(has_copy_from<T, ex::element_aligned_tag, SimdAbi>::value);
+ static_assert(has_copy_to<T, ex::element_aligned_tag, SimdAbi>::value);
+
+ // is_simd_flag_type_v<Flags> is false
+ static_assert(!has_copy_from<T, T, SimdAbi>::value);
+ static_assert(!has_copy_to<T, T, SimdAbi>::value);
+ static_assert(!has_copy_from<T, SimdAbi, SimdAbi>::value);
+ static_assert(!has_copy_to<T, SimdAbi, SimdAbi>::value);
+ }
+};
+
+int main(int, char**) {
+ test_all_simd_abi<CheckSimdMaskCopyFrom>();
+ test_all_simd_abi<CheckSimdMaskCopyTo>();
+ test_all_simd_abi<CheckSimdMaskCopyTraits>();
+ return 0;
+}
diff --git a/libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp b/libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp
index 2786dfbb7a60..5b4853a783c2 100644
--- a/libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/counted.iterator/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14, c++17
+// UNSUPPORTED: c++03, c++11, c++14, c++17
// counted_iterator
diff --git a/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp b/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp
index 10729e0029d0..3c2e6af98d55 100644
--- a/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/insert.iterators/back.insert.iterator/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <iterator>
diff --git a/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp b/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp
index f91d472e9ea2..f9b086aea4fc 100644
--- a/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/insert.iterators/front.insert.iterator/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <iterator>
diff --git a/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp b/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp
index e5744465daa9..b84a07017dae 100644
--- a/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/iterators/predef.iterators/move.iterators/move.iterator/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <iterator>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
index 21663cdf956d..0241e7cefcac 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array14.pass.cpp
@@ -8,11 +8,11 @@
// test sized operator delete[] replacement.
+// TODO(mordante) fix this test after updating clang in Docker
+// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19
// UNSUPPORTED: sanitizer-new-delete, c++03, c++11
-
-// NOTE: Clang does not enable sized-deallocation in C++14 and beyond by
-// default. It is only enabled when -fsized-deallocation is given.
-// XFAIL: clang, apple-clang
+// XFAIL: apple-clang
+// XFAIL: using-built-library-before-llvm-11
#include <new>
#include <cstddef>
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
index a8701ce7a86c..2ab691618ea4 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete14.pass.cpp
@@ -8,11 +8,11 @@
// test sized operator delete replacement.
+// TODO(mordante) fix this test after updating clang in Docker
+// UNSUPPORTED: clang-15, clang-16, clang-17, clang-18, clang-19
// UNSUPPORTED: sanitizer-new-delete, c++03, c++11
-
-// NOTE: Clang does not enable sized-deallocation in C++14 and beyond by
-// default. It is only enabled when -fsized-deallocation is given.
-// XFAIL: clang, apple-clang
+// XFAIL: apple-clang
+// XFAIL: using-built-library-before-llvm-11
#include <new>
#include <cstddef>
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
index c802ab787682..fbd1c7c5715e 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed the
+// behavior of FP parsing; Apple back-deployment targets remain broken because the
+// fix lives in the dylib.
+// UNSUPPORTED: using-built-library-before-llvm-19
+
// <locale>
// class num_get<charT, InputIterator>
@@ -116,9 +121,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "INF";
@@ -128,9 +133,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "-inf";
@@ -140,9 +145,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "-INF";
@@ -152,9 +157,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "nan";
@@ -164,9 +169,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
const char str[] = "NAN";
@@ -176,9 +181,129 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "+p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "+P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "-p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "-P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "+e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "+E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "-e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
+ }
+ {
+ const char str[] = "-E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0);
}
{
v = -1;
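
The rewritten expectations above reflect the post-LWG2381 behavior: num_get no longer accepts "inf"/"nan" spellings, the iterator stops at the first rejected character, failbit is set, and the output is zeroed. A small sketch of what a conforming library should do (assuming a libc++ built after the LLVM 19 fix):

    #include <cassert>
    #include <sstream>

    int main() {
      std::istringstream in("inf");
      double d = -1.0;
      in >> d;              // "inf" is rejected by num_get after the LWG2381 fix
      assert(in.fail());
      assert(d == 0.0);     // a failed extraction stores 0 (since C++11)
      return 0;
    }
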
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
index 79c8480d0699..b5ac7d876157 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed the
+// behavior of FP parsing; Apple back-deployment targets remain broken because the
+// fix lives in the dylib.
+// UNSUPPORTED: using-built-library-before-llvm-19
+
// <locale>
// class num_get<charT, InputIterator>
@@ -105,9 +110,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "INF";
@@ -117,9 +122,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "-inf";
@@ -129,9 +134,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "-INF";
@@ -141,9 +146,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "nan";
@@ -153,9 +158,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
const char str[] = "NAN";
@@ -165,9 +170,129 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "+p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "+P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "-p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "-P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "+e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "+E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "-e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
+ }
+ {
+ const char str[] = "-E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0f);
}
{
v = -1;
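
The assertions above encode the post-LWG2381 behavior: "inf"/"nan" spellings and bare exponent markers are no longer accepted, so the parse fails and the value is zeroed. A minimal standalone sketch of the same behavior through operator>> (an assumption for illustration: a standard library with the LWG2381 fix, compiled as C++11 or later, where a failed extraction stores zero):

#include <cassert>
#include <sstream>

int main() {
  std::istringstream in("inf");
  float v = -1.0f;
  in >> v;            // formatted extraction goes through num_get<char>
  assert(in.fail());  // "inf" is rejected after the LWG2381 fix
  assert(v == 0.0f);  // a failed extraction stores zero (since C++11)
  return 0;
}
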
diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
index e2b2aeafd1ef..9617899f749c 100644
--- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp
@@ -6,6 +6,11 @@
//
//===----------------------------------------------------------------------===//
+// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed
+// the behavior of FP parsing, while Apple back-deployment targets remain broken
+// because they link against a dylib that predates the fix.
+// UNSUPPORTED: using-built-library-before-llvm-19
+
// <locale>
// class num_get<charT, InputIterator>
@@ -105,9 +110,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "INF";
@@ -117,9 +122,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == INFINITY);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "-inf";
@@ -129,9 +134,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "-INF";
@@ -141,9 +146,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(v == -INFINITY);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "nan";
@@ -153,9 +158,9 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "NAN";
@@ -165,9 +170,129 @@ int main(int, char**)
f.get(cpp17_input_iterator<const char*>(str),
cpp17_input_iterator<const char*>(str+sizeof(str)),
ios, err, v);
- assert(base(iter) == str+sizeof(str)-1);
- assert(err == ios.goodbit);
- assert(std::isnan(v));
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "+p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "+P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "-p00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "-P00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "+e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "+E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "-e00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
+ }
+ {
+ const char str[] = "-E00";
+ std::hex(ios);
+ std::ios_base::iostate err = ios.goodbit;
+ cpp17_input_iterator<const char*> iter = f.get(
+ cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + sizeof(str)), ios, err, v);
+ assert(base(iter) == str + 1);
+ assert(err == ios.failbit);
+ assert(v == 0.0l);
}
{
const char str[] = "1.189731495357231765021264e+49321";
diff --git a/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.reduce.pass.cpp b/libcxx/test/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp
index b083c4f80e0b..f5748d7c823b 100644
--- a/libcxx/test/std/algorithms/numeric.ops/reduce/pstl.reduce.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/reduce/pstl.reduce.pass.cpp
@@ -10,7 +10,7 @@
// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-// <algorithm>
+// <numeric>
// template<class ExecutionPolicy, class ForwardIterator>
// typename iterator_traits<ForwardIterator>::value_type
diff --git a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp b/libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
index 18b56f237c3e..6d8bb47ac7dc 100644
--- a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.binary.pass.cpp
@@ -10,7 +10,7 @@
// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-// <algorithm>
+// <numeric>
// template<class ExecutionPolicy,
// class ForwardIterator1, class ForwardIterator2, class T>
diff --git a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp b/libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp
index a32a4f85f633..4cea3d405aa0 100644
--- a/libcxx/test/std/algorithms/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp
+++ b/libcxx/test/std/numerics/numeric.ops/transform.reduce/pstl.transform_reduce.unary.pass.cpp
@@ -10,7 +10,7 @@
// UNSUPPORTED: libcpp-has-no-incomplete-pstl
-// <algorithm>
+// <numeric>
// template<class ExecutionPolicy,
// class ForwardIterator, class T,
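
The renamed paths and the corrected synopsis comments reflect that these overloads are declared in <numeric>, not <algorithm>. A small usage sketch (with hypothetical values) of the two algorithms these PSTL tests exercise:

#include <cassert>
#include <execution>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3, 4};
  // Both ExecutionPolicy overloads live in <numeric>.
  int sum = std::reduce(std::execution::par, v.begin(), v.end());
  int sum_of_squares = std::transform_reduce(
      std::execution::par, v.begin(), v.end(), 0,
      std::plus<>(), [](int x) { return x * x; });
  assert(sum == 10);
  assert(sum_of_squares == 30);
  return 0;
}
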
diff --git a/libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp b/libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp
index c76c4a01c696..36584f76bebd 100644
--- a/libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp
+++ b/libcxx/test/std/strings/string.view/string.view.deduct/implicit.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <string_view>
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp
index 42150207c3c4..6a054f74b9fb 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -19,77 +18,92 @@
// const chrono::duration<Rep, Period>& rel_time);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-bool expect_timeout = false;
-
-void f()
-{
- typedef std::chrono::system_clock Clock;
- typedef std::chrono::milliseconds milliseconds;
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point wait_end = t0 + milliseconds(250);
- Clock::duration d;
- do {
- d = wait_end - Clock::now();
- if (d <= milliseconds(0)) break;
- } while (test2 == 0 && cv.wait_for(lk, d) == std::cv_status::no_timeout);
- Clock::time_point t1 = Clock::now();
- if (!expect_timeout)
- {
- assert(t1 - t0 < milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - milliseconds(250) < milliseconds(50));
- assert(test2 == 0);
- }
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- std::unique_lock<std::mutex> lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- expect_timeout = true;
- {
- std::unique_lock<std::mutex> lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+int main(int, char**) {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_for() and we wait
+ // again in case we get awoken spuriously. Note that it can actually
+ // happen that we get awoken spuriously and fail to recognize it
+ // (making this test useless), but the likelihood should be small.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ do {
+ std::cv_status result = cv.wait_for(lock, timeout);
+ assert(result == std::cv_status::no_timeout);
+ } while (likely_spurious);
+ });
+
+ // This can technically fail if we have many spurious awakenings, but in practice the
+ // tolerance is so high that it shouldn't be a problem.
+ assert(elapsed < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This blocks the condition variable inside its wait call
+ // so we can notify it while it is waiting.
+ std::unique_lock<std::mutex> lock(mutex);
+ cv.notify_one();
+ likely_spurious = false;
+ lock.unlock();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable
+ // with a certain timeout, and we never awaken it. To guard against
+ // spurious wakeups, we wait again whenever we are awoken for a reason
+ // other than a timeout.
+ {
+ auto timeout = std::chrono::milliseconds(250);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ std::cv_status result;
+ do {
+ auto elapsed = measure([&] { result = cv.wait_for(lock, timeout); });
+ if (result == std::cv_status::timeout)
+ assert(elapsed >= timeout);
+ } while (result != std::cv_status::timeout);
+ });
+
+ t1.join();
+ }
return 0;
}
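
The rewritten condition_variable tests all rely on the handshake described in their comments: the waiter sets an atomic flag while holding the mutex, and the notifier spins on that flag and then acquires the same mutex, which guarantees the waiter is already blocked inside the wait before notify_one() fires. A minimal sketch of just that handshake (using plain std::thread instead of the make_test_thread helper):

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <thread>

int main() {
  std::atomic<bool> ready(false);
  std::atomic<bool> done(false);
  std::condition_variable cv;
  std::mutex m;

  std::thread waiter([&] {
    std::unique_lock<std::mutex> lock(m);
    ready = true;                          // set only while the mutex is held
    cv.wait(lock, [&] { return done.load(); });
  });

  std::thread notifier([&] {
    while (!ready) { /* spin */ }
    std::unique_lock<std::mutex> lock(m);  // waiter must be inside wait() now
    done = true;
    lock.unlock();
    cv.notify_one();
  });

  notifier.join();
  waiter.join();
  return 0;
}
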
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp
index 872bcb6d8a57..76fc7393bc8f 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_for_pred.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -20,82 +19,141 @@
// Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
-
- bool operator()() {return i_ != 0;}
-};
-
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-void f()
-{
- typedef std::chrono::system_clock Clock;
- typedef std::chrono::milliseconds milliseconds;
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- bool r = cv.wait_for(lk, milliseconds(250), Pred(test2));
- ((void)r); // Prevent unused warning
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - milliseconds(250) < milliseconds(50));
- assert(test2 == 0);
- }
- ++runs;
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+int main(int, char**) {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_for() and we try to minimize
+ // the likelihood that we got awoken by a spurious wakeup by updating the
+ // likely_spurious flag only immediately before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ bool result = cv.wait_for(lock, timeout, [&] { return !likely_spurious; });
+ assert(result); // return value should be true since we didn't time out
+ });
+ assert(elapsed < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable with a certain
+ // timeout, and we never awaken it. The "stop waiting" predicate always returns false,
+ // which means that we can't get out of the wait via a spurious wakeup.
+ {
+ auto timeout = std::chrono::milliseconds(250);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ auto elapsed = measure([&] {
+ bool result = cv.wait_for(lock, timeout, [] { return false; }); // never stop waiting (until timeout)
+ assert(!result); // return value should be false since the predicate returns false after the timeout
+ });
+ assert(elapsed >= timeout);
+ });
+
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we set a fairly long timeout in wait_for() and we basically never
+ // wake up the condition variable. This way, we are hoping to get out of the wait
+ // via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ bool result = cv.wait_for(lock, timeout, [&] { return true; });
+ awoken = true;
+ assert(result); // return value should be true since we didn't time out
+ });
+ assert(elapsed < timeout); // can technically fail if t2 never executes and we timeout, but very unlikely
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test
+ // doesn't keep running until the timeout.
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp
index 15feba55616b..5ce5bccb37f1 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_pred.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -17,51 +16,98 @@
// void wait(unique_lock<mutex>& lock, Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
#include <mutex>
#include <thread>
-#include <functional>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
-
- bool operator()() {return i_ != 0;}
-};
-
-void f()
-{
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- cv.wait(lk, Pred(test2));
- assert(test2 != 0);
-}
+int main(int, char**) {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we try to minimize the likelihood that we got awoken by a
+ // spurious wakeup by updating the likely_spurious flag only immediately
+ // before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ cv.wait(lock, [&] { return !likely_spurious; });
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we basically never wake up the condition variable. This way, we
+ // are hoping to get out of the wait via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ cv.wait(lock, [&] { return true; });
+ awoken = true;
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test finishes.
+ cv.notify_one();
+ });
-int main(int, char**)
-{
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
+ t2.join();
+ t1.join();
+ }
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp
index 03205e68dca6..6f3a5a01cdd1 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -19,100 +18,100 @@
// const chrono::time_point<Clock, Duration>& abs_time);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-struct TestClock
-{
- typedef std::chrono::milliseconds duration;
- typedef duration::rep rep;
- typedef duration::period period;
- typedef std::chrono::time_point<TestClock> time_point;
- static const bool is_steady = true;
-
- static time_point now()
- {
- using namespace std::chrono;
- return time_point(duration_cast<duration>(
- steady_clock::now().time_since_epoch()
- ));
- }
+struct TestClock {
+ typedef std::chrono::milliseconds duration;
+ typedef duration::rep rep;
+ typedef duration::period period;
+ typedef std::chrono::time_point<TestClock> time_point;
+ static const bool is_steady = true;
+
+ static time_point now() {
+ using namespace std::chrono;
+ return time_point(duration_cast<duration>(steady_clock::now().time_since_epoch()));
+ }
};
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-template <typename Clock>
-void f()
-{
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- typename Clock::time_point t0 = Clock::now();
- typename Clock::time_point t = t0 + std::chrono::milliseconds(250);
- while (test2 == 0 && cv.wait_until(lk, t) == std::cv_status::no_timeout)
- ;
- typename Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < std::chrono::milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - std::chrono::milliseconds(250) < std::chrono::milliseconds(50));
- assert(test2 == 0);
- }
- ++runs;
-}
+template <class Clock>
+void test() {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_until() and we wait
+ // again in case we get awoken spuriously. Note that it can actually
+ // happen that we get awoken spuriously and fail to recognize it
+ // (making this test useless), but the likelihood should be small.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ do {
+ std::cv_status result = cv.wait_until(lock, timeout);
+ assert(result == std::cv_status::no_timeout);
+ } while (likely_spurious);
+
+ // This can technically fail if we have many spurious awakenings, but in practice the
+ // tolerance is so high that it shouldn't be a problem.
+ assert(Clock::now() < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This blocks the condition variable inside its wait call
+ // so we can notify it while it is waiting.
+ std::unique_lock<std::mutex> lock(mutex);
+ cv.notify_one();
+ likely_spurious = false;
+ lock.unlock();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable
+ // with a certain timeout, and we never awaken it. To guard against
+ // spurious wakeups, we wait again whenever we are awoken for a reason
+ // other than a timeout.
+ {
+ auto timeout = Clock::now() + std::chrono::milliseconds(250);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ std::cv_status result;
+ do {
+ result = cv.wait_until(lock, timeout);
+ if (result == std::cv_status::timeout)
+ assert(Clock::now() >= timeout);
+ } while (result != std::cv_status::timeout);
+ });
-template <typename Clock>
-void run_test()
-{
- runs = 0;
- test1 = 0;
- test2 = 0;
- {
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f<Clock>);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- std::unique_lock<std::mutex>lk(mut);
- std::thread t = support::make_test_thread(f<Clock>);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+ t1.join();
+ }
}
-int main(int, char**)
-{
- run_test<TestClock>();
- run_test<std::chrono::steady_clock>();
- run_test<std::chrono::system_clock>();
- return 0;
+int main(int, char**) {
+ test<TestClock>();
+ test<std::chrono::steady_clock>();
+ return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp
index fb8bd6e38069..847d0c10c572 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvar/wait_until_pred.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -20,99 +19,145 @@
// Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-struct Clock
-{
- typedef std::chrono::milliseconds duration;
- typedef duration::rep rep;
- typedef duration::period period;
- typedef std::chrono::time_point<Clock> time_point;
- static const bool is_steady = true;
-
- static time_point now()
- {
- using namespace std::chrono;
- return time_point(duration_cast<duration>(
- steady_clock::now().time_since_epoch()
- ));
- }
+struct TestClock {
+ typedef std::chrono::milliseconds duration;
+ typedef duration::rep rep;
+ typedef duration::period period;
+ typedef std::chrono::time_point<TestClock> time_point;
+ static const bool is_steady = true;
+
+ static time_point now() {
+ using namespace std::chrono;
+ return time_point(duration_cast<duration>(steady_clock::now().time_since_epoch()));
+ }
};
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
+template <class Clock>
+void test() {
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_until() and we try to minimize
+ // the likelihood that we got awoken by a spurious wakeup by updating the
+ // likely_spurious flag only immediately before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
- bool operator()() {return i_ != 0;}
-};
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ bool result = cv.wait_until(lock, timeout, [&] { return !likely_spurious; });
+ assert(result); // return value should be true since we didn't time out
+ assert(Clock::now() < timeout);
+ });
-std::condition_variable cv;
-std::mutex mut;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-void f()
-{
- std::unique_lock<std::mutex> lk(mut);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point t = t0 + Clock::duration(250);
- bool r = cv.wait_until(lk, t, Pred(test2));
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < Clock::duration(250));
- assert(test2 != 0);
- assert(r);
- }
- else
- {
- assert(t1 - t0 - Clock::duration(250) < Clock::duration(50));
- assert(test2 == 0);
- assert(!r);
- }
- ++runs;
-}
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable with a certain
+ // timeout, and we never awaken it. The "stop waiting" predicate always returns false,
+ // which means that we can't get out of the wait via a spurious wakeup.
+ {
+ auto timeout = Clock::now() + std::chrono::milliseconds(250);
+ std::condition_variable cv;
+ std::mutex mutex;
-int main(int, char**)
-{
- {
- std::unique_lock<std::mutex> lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- std::unique_lock<std::mutex> lk(mut);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ bool result = cv.wait_until(lock, timeout, [] { return false; }); // never stop waiting (until timeout)
+ assert(!result); // return value should be false since the predicate returns false after the timeout
+ assert(Clock::now() >= timeout);
+ });
+
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we set a fairly long timeout in wait_until() and we basically never
+ // wake up the condition variable. This way, we are hoping to get out of the wait
+ // via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable cv;
+ std::mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ std::unique_lock<std::mutex> lock(mutex);
+ ready = true;
+ bool result = cv.wait_until(lock, timeout, [&] { return true; });
+ awoken = true;
+ assert(result); // return value should be true since we didn't time out
+ assert(Clock::now() < timeout); // can technically fail if t2 never executes and we timeout, but very unlikely
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ std::unique_lock<std::mutex> lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test
+ // doesn't keep running until the timeout.
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+}
+int main(int, char**) {
+ test<TestClock>();
+ test<std::chrono::steady_clock>();
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp
index 95acef90470e..eab38081d7b7 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -18,81 +17,105 @@
// wait_for(Lock& lock, const chrono::duration<Rep, Period>& rel_time);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-std::condition_variable_any cv;
-
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
-
-L0 m0;
-
-int test1 = 0;
-int test2 = 0;
-
-bool expect_timeout = false;
-
-void f()
-{
- typedef std::chrono::system_clock Clock;
- typedef std::chrono::milliseconds milliseconds;
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point wait_end = t0 + milliseconds(250);
- Clock::duration d;
- do {
- d = wait_end - Clock::now();
- if (d <= milliseconds(0)) break;
- } while (test2 == 0 && cv.wait_for(lk, d) == std::cv_status::no_timeout);
- Clock::time_point t1 = Clock::now();
- if (!expect_timeout)
- {
- assert(t1 - t0 < milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - milliseconds(250) < milliseconds(50));
- assert(test2 == 0);
- }
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
+};
+
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- expect_timeout = true;
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+template <class Lock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_for() and we wait
+ // again in case we get awoken spuriously. Note that it can actually
+ // happen that we get awoken spuriously and fail to recognize it
+ // (making this test useless), but the likelihood should be small.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ do {
+ std::cv_status result = cv.wait_for(lock, timeout);
+ assert(result == std::cv_status::no_timeout);
+ } while (likely_spurious);
+ });
+
+ // This can technically fail if we have many spurious awakenings, but in practice the
+ // tolerance is so high that it shouldn't be a problem.
+ assert(elapsed < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This blocks the condition variable inside its wait call
+ // so we can notify it while it is waiting.
+ Lock lock(mutex);
+ cv.notify_one();
+ likely_spurious = false;
+ lock.unlock();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable
+ // with a certain timeout, and we never awaken it. To guard against
+ // spurious wakeups, we wait again whenever we are awoken for a reason
+ // other than a timeout.
+ {
+ auto timeout = std::chrono::milliseconds(250);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ std::cv_status result;
+ do {
+ auto elapsed = measure([&] { result = cv.wait_for(lock, timeout); });
+ if (result == std::cv_status::timeout)
+ assert(elapsed >= timeout);
+ } while (result != std::cv_status::timeout);
+ });
+
+ t1.join();
+ }
+}
+int main(int, char**) {
+ test<std::unique_lock<std::mutex>>();
+ test<std::unique_lock<std::timed_mutex>>();
+ test<MyLock<std::mutex>>();
+ test<MyLock<std::timed_mutex>>();
return 0;
}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp
index 0b560022bc67..2dc36938b41e 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_pred.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -19,89 +18,148 @@
// Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
-
- bool operator()() {return i_ != 0;}
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
};
-std::condition_variable_any cv;
-
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
-
-L0 m0;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-bool expect_result = false;
-
-void f()
-{
- typedef std::chrono::system_clock Clock;
- typedef std::chrono::milliseconds milliseconds;
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- bool result = cv.wait_for(lk, milliseconds(250), Pred(test2));
- assert(result == expect_result);
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < milliseconds(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - milliseconds(250) < milliseconds(50));
- assert(test2 == 0);
- }
- ++runs;
+template <class Function>
+std::chrono::microseconds measure(Function f) {
+ std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
+ f();
+ std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
+ return std::chrono::duration_cast<std::chrono::microseconds>(end - start);
}
-int main(int, char**)
-{
- {
- expect_result = true;
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- expect_result = false;
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
-
- return 0;
+template <class Lock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_for() and we try to minimize
+ // the likelihood that we got awoken by a spurious wakeup by updating the
+ // likely_spurious flag only immediately before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ bool result = cv.wait_for(lock, timeout, [&] { return !likely_spurious; });
+ assert(result); // return value should be true since we didn't time out
+ });
+ assert(elapsed < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable with a certain
+ // timeout, and we never awaken it. The "stop waiting" predicate always returns false,
+ // which means that we can't get out of the wait via a spurious wakeup.
+ {
+ auto timeout = std::chrono::milliseconds(250);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ auto elapsed = measure([&] {
+ bool result = cv.wait_for(lock, timeout, [] { return false; }); // never stop waiting (until timeout)
+ assert(!result); // return value should be false since the predicate returns false after the timeout
+ });
+ assert(elapsed >= timeout);
+ });
+
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we set a fairly long timeout in wait_for() and we basically never
+ // wake up the condition variable. This way, we are hoping to get out of the wait
+ // via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ auto timeout = std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ auto elapsed = measure([&] {
+ ready = true;
+ bool result = cv.wait_for(lock, timeout, [&] { return true; });
+ awoken = true;
+ assert(result); // return value should be true since we didn't time out
+ });
+ assert(elapsed < timeout); // can technically fail if t2 never executes and we timeout, but very unlikely
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test
+ // doesn't keep running until the timeout.
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
}
+
+int main(int, char**) {
+  test<std::unique_lock<std::mutex>>();
+  test<std::unique_lock<std::timed_mutex>>();
+  test<MyLock<std::mutex>>();
+  test<MyLock<std::timed_mutex>>();
+  return 0;
+}
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp
index a5e28137bef8..48efbf12e738 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_pred.pass.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -17,55 +16,113 @@
// void wait(Lock& lock, Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
#include <mutex>
#include <thread>
-#include <functional>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-std::condition_variable_any cv;
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
+};
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
+template <class Lock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
-L0 m0;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we try to minimize the likelihood that we got awoken by a
+ // spurious wakeup by updating the likely_spurious flag only immediately
+ // before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ std::condition_variable_any cv;
+ Mutex mutex;
-int test1 = 0;
-int test2 = 0;
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ cv.wait(lock, [&] { return !likely_spurious; });
+ });
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
- bool operator()() {return i_ != 0;}
-};
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we basically never wake up the condition variable. This way, we
+ // are hoping to get out of the wait via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ cv.wait(lock, [&] { return true; });
+ awoken = true;
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test finishes.
+ cv.notify_one();
+ });
-void f()
-{
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- cv.wait(lk, Pred(test2));
- assert(test2 != 0);
+ t2.join();
+ t1.join();
+ }
}
-int main(int, char**)
-{
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
+int main(int, char**) {
+ test<std::unique_lock<std::mutex>>();
+ test<std::unique_lock<std::timed_mutex>>();
+ test<MyLock<std::mutex>>();
+ test<MyLock<std::timed_mutex>>();
return 0;
}
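
For reference, a minimal standalone sketch (outside this patch) of the notify/wait handshake the rewritten test exercises; it assumes only the standard <condition_variable> API, and the names `done` and `waiter` are illustrative:

    // Sketch: condition_variable_any::wait(lock, pred) re-checks the predicate after
    // every wakeup, so a spurious wakeup before `done` is set simply resumes waiting.
    #include <condition_variable>
    #include <mutex>
    #include <thread>
    #include <cassert>

    int main() {
      std::condition_variable_any cv;
      std::mutex m;
      bool done = false;

      std::thread waiter([&] {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock, [&] { return done; }); // returns only once done == true
      });

      {
        std::lock_guard<std::mutex> lock(m);
        done = true;
      }
      cv.notify_one();
      waiter.join();
      assert(done);
      return 0;
    }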
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp
index 0f2334393d83..6494bcd6dbe3 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -18,93 +17,115 @@
// wait_until(Lock& lock, const chrono::time_point<Clock, Duration>& abs_time);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-struct Clock
-{
- typedef std::chrono::milliseconds duration;
- typedef duration::rep rep;
- typedef duration::period period;
- typedef std::chrono::time_point<Clock> time_point;
- static const bool is_steady = true;
-
- static time_point now()
- {
- using namespace std::chrono;
- return time_point(duration_cast<duration>(
- steady_clock::now().time_since_epoch()
- ));
- }
+struct TestClock {
+ typedef std::chrono::milliseconds duration;
+ typedef duration::rep rep;
+ typedef duration::period period;
+ typedef std::chrono::time_point<TestClock> time_point;
+ static const bool is_steady = true;
+
+ static time_point now() {
+ using namespace std::chrono;
+ return time_point(duration_cast<duration>(steady_clock::now().time_since_epoch()));
+ }
};
-std::condition_variable_any cv;
-
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
-
-L0 m0;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-void f()
-{
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point t = t0 + Clock::duration(250);
- while (test2 == 0 && cv.wait_until(lk, t) == std::cv_status::no_timeout)
- ;
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < Clock::duration(250));
- assert(test2 != 0);
- }
- else
- {
- assert(t1 - t0 - Clock::duration(250) < Clock::duration(50));
- assert(test2 == 0);
- }
- ++runs;
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
+};
+
+template <class Lock, class Clock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_until() and we wait
+ // again in case we get awoken spuriously. Note that it can actually
+ // happen that we get awoken spuriously and fail to recognize it
+ // (making this test useless), but the likelihood should be small.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ do {
+ std::cv_status result = cv.wait_until(lock, timeout);
+ assert(result == std::cv_status::no_timeout);
+ } while (likely_spurious);
+
+ // This can technically fail if we have many spurious awakenings, but in practice the
+ // tolerance is so high that it shouldn't be a problem.
+ assert(Clock::now() < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This blocks the condition variable inside its wait call
+ // so we can notify it while it is waiting.
+ Lock lock(mutex);
+ cv.notify_one();
+ likely_spurious = false;
+ lock.unlock();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable
+ // with a certain timeout, and we never awaken it. To guard against
+ // spurious wakeups, we wait again whenever we are awoken for a reason
+ // other than a timeout.
+ {
+ auto timeout = Clock::now() + std::chrono::milliseconds(250);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ std::cv_status result;
+ do {
+ result = cv.wait_until(lock, timeout);
+ if (result == std::cv_status::timeout)
+ assert(Clock::now() >= timeout);
+ } while (result != std::cv_status::timeout);
+ });
+
+ t1.join();
+ }
}
-int main(int, char**)
-{
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+int main(int, char**) {
+ test<std::unique_lock<std::mutex>, TestClock>();
+ test<std::unique_lock<std::mutex>, std::chrono::steady_clock>();
+
+ test<std::unique_lock<std::timed_mutex>, TestClock>();
+ test<std::unique_lock<std::timed_mutex>, std::chrono::steady_clock>();
+
+ test<MyLock<std::mutex>, TestClock>();
+ test<MyLock<std::mutex>, std::chrono::steady_clock>();
+ test<MyLock<std::timed_mutex>, TestClock>();
+ test<MyLock<std::timed_mutex>, std::chrono::steady_clock>();
return 0;
}
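
A small standalone sketch (not from the patch) of the timeout path the second half of the test covers, assuming std::chrono::steady_clock; the 50 ms deadline is arbitrary:

    // Sketch: with nobody calling notify, wait_until() must eventually report
    // cv_status::timeout; spurious wakeups only cause another loop iteration.
    #include <condition_variable>
    #include <mutex>
    #include <chrono>
    #include <cassert>

    int main() {
      std::condition_variable_any cv;
      std::mutex m;
      std::unique_lock<std::mutex> lock(m);

      auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(50);
      std::cv_status status = std::cv_status::no_timeout;
      while (status != std::cv_status::timeout)
        status = cv.wait_until(lock, deadline);
      assert(std::chrono::steady_clock::now() >= deadline);
      return 0;
    }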
diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp
index aa60ae4715df..ee7c1729aacf 100644
--- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp
+++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_pred.pass.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// UNSUPPORTED: no-threads
-// ALLOW_RETRIES: 2
+// UNSUPPORTED: no-threads, c++03
// <condition_variable>
@@ -20,103 +19,171 @@
// Predicate pred);
#include <condition_variable>
+#include <atomic>
+#include <cassert>
+#include <chrono>
#include <mutex>
#include <thread>
-#include <chrono>
-#include <cassert>
#include "make_test_thread.h"
#include "test_macros.h"
-struct Clock
-{
- typedef std::chrono::milliseconds duration;
- typedef duration::rep rep;
- typedef duration::period period;
- typedef std::chrono::time_point<Clock> time_point;
- static const bool is_steady = true;
-
- static time_point now()
- {
- using namespace std::chrono;
- return time_point(duration_cast<duration>(
- steady_clock::now().time_since_epoch()
- ));
- }
+struct TestClock {
+ typedef std::chrono::milliseconds duration;
+ typedef duration::rep rep;
+ typedef duration::period period;
+ typedef std::chrono::time_point<TestClock> time_point;
+ static const bool is_steady = true;
+
+ static time_point now() {
+ using namespace std::chrono;
+ return time_point(duration_cast<duration>(steady_clock::now().time_since_epoch()));
+ }
};
-class Pred
-{
- int& i_;
-public:
- explicit Pred(int& i) : i_(i) {}
-
- bool operator()() {return i_ != 0;}
+template <class Mutex>
+struct MyLock : std::unique_lock<Mutex> {
+ using std::unique_lock<Mutex>::unique_lock;
};
-std::condition_variable_any cv;
-
-typedef std::timed_mutex L0;
-typedef std::unique_lock<L0> L1;
-
-L0 m0;
-
-int test1 = 0;
-int test2 = 0;
-
-int runs = 0;
-
-void f()
-{
- L1 lk(m0);
- assert(test2 == 0);
- test1 = 1;
- cv.notify_one();
- Clock::time_point t0 = Clock::now();
- Clock::time_point t = t0 + Clock::duration(250);
- bool r = cv.wait_until(lk, t, Pred(test2));
- Clock::time_point t1 = Clock::now();
- if (runs == 0)
- {
- assert(t1 - t0 < Clock::duration(250));
- assert(test2 != 0);
- assert(r);
- }
- else
- {
- assert(t1 - t0 - Clock::duration(250) < Clock::duration(50));
- assert(test2 == 0);
- assert(!r);
- }
- ++runs;
+template <class Lock, class Clock>
+void test() {
+ using Mutex = typename Lock::mutex_type;
+ // Test unblocking via a call to notify_one() in another thread.
+ //
+ // To test this, we set a very long timeout in wait_until() and we try to minimize
+ // the likelihood that we got awoken by a spurious wakeup by updating the
+ // likely_spurious flag only immediately before we perform the notification.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> likely_spurious(true);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ bool result = cv.wait_until(lock, timeout, [&] { return !likely_spurious; });
+ assert(result); // return value should be true since we didn't time out
+ assert(Clock::now() < timeout);
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+
+ likely_spurious = false;
+ lock.unlock();
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
+
+ // Test unblocking via a timeout.
+ //
+ // To test this, we create a thread that waits on a condition variable with a certain
+ // timeout, and we never awaken it. The "stop waiting" predicate always returns false,
+ // which means that we can't get out of the wait via a spurious wakeup.
+ {
+ auto timeout = Clock::now() + std::chrono::milliseconds(250);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ bool result = cv.wait_until(lock, timeout, [] { return false; }); // never stop waiting (until timeout)
+ assert(!result); // return value should be false since the predicate returns false after the timeout
+ assert(Clock::now() >= timeout);
+ });
+
+ t1.join();
+ }
+
+ // Test unblocking via a spurious wakeup.
+ //
+ // To test this, we set a fairly long timeout in wait_until() and we basically never
+ // wake up the condition variable. This way, we are hoping to get out of the wait
+ // via a spurious wakeup.
+ //
+ // However, since spurious wakeups are not required to even happen, this test is
+ // only trying to trigger that code path, but not actually asserting that it is
+ // taken. In particular, we do need to eventually ensure we get out of the wait
+ // by standard means, so we actually wake up the thread at the end.
+ {
+ std::atomic<bool> ready(false);
+ std::atomic<bool> awoken(false);
+ auto timeout = Clock::now() + std::chrono::seconds(3600);
+ std::condition_variable_any cv;
+ Mutex mutex;
+
+ std::thread t1 = support::make_test_thread([&] {
+ Lock lock(mutex);
+ ready = true;
+ bool result = cv.wait_until(lock, timeout, [&] { return true; });
+ awoken = true;
+ assert(result); // return value should be true since we didn't time out
+ assert(Clock::now() < timeout); // can technically fail if t2 never executes and we timeout, but very unlikely
+ });
+
+ std::thread t2 = support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+
+ // Acquire the same mutex as t1. This ensures that the condition variable has started
+ // waiting (and hence released that mutex).
+ Lock lock(mutex);
+ lock.unlock();
+
+ // Give some time for t1 to be awoken spuriously so that code path is used.
+ std::this_thread::sleep_for(std::chrono::seconds(1));
+
+ // We would want to assert that the thread has been awoken after this time,
+ // however nothing guarantees us that it ever gets spuriously awoken, so
+ // we can't really check anything. This is still left here as documentation.
+ bool woke = awoken.load();
+ assert(woke || !woke);
+
+ // Whatever happened, actually awaken the condition variable to ensure the test
+ // doesn't keep running until the timeout.
+ cv.notify_one();
+ });
+
+ t2.join();
+ t1.join();
+ }
}
-int main(int, char**)
-{
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- test2 = 1;
- lk.unlock();
- cv.notify_one();
- t.join();
- }
- test1 = 0;
- test2 = 0;
- {
- L1 lk(m0);
- std::thread t = support::make_test_thread(f);
- assert(test1 == 0);
- while (test1 == 0)
- cv.wait(lk);
- assert(test1 != 0);
- lk.unlock();
- t.join();
- }
+int main(int, char**) {
+ // Run on multiple threads to speed up the test, and because it ought to work anyways.
+ std::thread tests[] = {
+ support::make_test_thread([] {
+ test<std::unique_lock<std::mutex>, TestClock>();
+ test<std::unique_lock<std::mutex>, std::chrono::steady_clock>();
+ }),
+ support::make_test_thread([] {
+ test<std::unique_lock<std::timed_mutex>, TestClock>();
+ test<std::unique_lock<std::timed_mutex>, std::chrono::steady_clock>();
+ }),
+ support::make_test_thread([] {
+ test<MyLock<std::mutex>, TestClock>();
+ test<MyLock<std::mutex>, std::chrono::steady_clock>();
+ }),
+ support::make_test_thread([] {
+ test<MyLock<std::timed_mutex>, TestClock>();
+ test<MyLock<std::timed_mutex>, std::chrono::steady_clock>();
+ })};
+
+ for (std::thread& t : tests)
+ t.join();
return 0;
}
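
As a reminder of the return-value contract the predicate overload relies on, here is a hedged standalone sketch (not part of the patch): wait_until(lock, t, pred) returns pred()'s value, so a never-satisfied predicate forces a false return once the deadline passes.

    #include <condition_variable>
    #include <mutex>
    #include <chrono>
    #include <cassert>

    int main() {
      std::condition_variable_any cv;
      std::mutex m;
      std::unique_lock<std::mutex> lock(m);

      auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(50);
      // The predicate is never satisfied, so spurious wakeups cannot end the wait
      // early and the call must return false after the deadline.
      bool result = cv.wait_until(lock, deadline, [] { return false; });
      assert(!result);
      assert(std::chrono::steady_clock::now() >= deadline);
      return 0;
    }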
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
index b75441733482..9319ec0dba04 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.guard/implicit_ctad.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <mutex>
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
index 7305b48c53a9..86bda3a9c6b9 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.scoped/implicit_ctad.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <mutex>
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp
index 9a595f90ed4f..826ec2b558f0 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/implicit_ctad.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <shared_mutex>
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp
index 4940041bcf96..ece330134f2c 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.cons/mutex.pass.cpp
@@ -5,10 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
// <shared_mutex>
@@ -19,9 +18,8 @@
// template<class _Mutex> shared_lock(shared_lock<_Mutex>)
// -> shared_lock<_Mutex>; // C++17
+#include <atomic>
#include <cassert>
-#include <chrono>
-#include <cstdlib>
#include <shared_mutex>
#include <thread>
#include <vector>
@@ -29,77 +27,77 @@
#include "make_test_thread.h"
#include "test_macros.h"
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
-
-ms WaitTime = ms(250);
-
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(50);
-#else
-ms Tolerance = ms(50 * 5);
-#endif
+struct Monitor {
+ bool lock_shared_called = false;
+ bool unlock_shared_called = false;
+};
-std::shared_timed_mutex m;
+struct TrackedMutex {
+ Monitor* monitor = nullptr;
-void f()
-{
- time_point t0 = Clock::now();
- time_point t1;
- {
- std::shared_lock<std::shared_timed_mutex> ul(m);
- t1 = Clock::now();
- }
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
-}
+ void lock_shared() {
+ if (monitor != nullptr)
+ monitor->lock_shared_called = true;
+ }
+ void unlock_shared() {
+ if (monitor != nullptr)
+ monitor->unlock_shared_called = true;
+ }
+};
-void g()
-{
- time_point t0 = Clock::now();
- time_point t1;
- {
- std::shared_lock<std::shared_timed_mutex> ul(m);
- t1 = Clock::now();
- }
- ns d = t1 - t0;
- assert(d < Tolerance); // within tolerance
-}
+template <class Mutex>
+void test() {
+ // Basic sanity test
+ {
+ Mutex mutex;
+ std::vector<std::thread> threads;
+ std::atomic<bool> ready(false);
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
-int main(int, char**)
-{
- std::vector<std::thread> v;
- {
- m.lock();
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f));
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- for (auto& t : v)
- t.join();
- }
- {
- m.lock_shared();
- for (auto& t : v)
- t = support::make_test_thread(g);
- std::thread q = support::make_test_thread(f);
- std::this_thread::sleep_for(WaitTime);
- m.unlock_shared();
- for (auto& t : v)
- t.join();
- q.join();
+ std::shared_lock<Mutex> lock(mutex);
+ assert(lock.owns_lock());
+ }));
}
+ ready = true;
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Test CTAD
+ {
+#if TEST_STD_VER >= 17
+ Mutex mutex;
+ std::shared_lock lock(mutex);
+ static_assert(std::is_same<decltype(lock), std::shared_lock<Mutex>>::value);
+#endif
+ }
+}
+
+int main(int, char**) {
#if TEST_STD_VER >= 17
- std::shared_lock sl(m);
- static_assert((std::is_same<decltype(sl), std::shared_lock<decltype(m)>>::value), "" );
+ test<std::shared_mutex>();
#endif
+ test<std::shared_timed_mutex>();
+ test<TrackedMutex>();
+
+ // Use shared_lock with a dummy mutex class that tracks whether each
+ // operation has been called or not.
+ {
+ Monitor monitor;
+ TrackedMutex mutex{&monitor};
+
+ std::shared_lock<TrackedMutex> lock(mutex);
+ assert(monitor.lock_shared_called);
+ assert(lock.owns_lock());
+
+ lock.unlock();
+ assert(monitor.unlock_shared_called);
+ }
return 0;
}
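
The TrackedMutex/Monitor pair above works because std::shared_lock only requires the shared-lockable surface it actually uses. A standalone sketch under that assumption (CountingSharedMutex is an invented name):

    #include <shared_mutex>
    #include <cassert>

    // Any type providing lock_shared()/unlock_shared() can be wrapped by shared_lock,
    // which makes it easy to observe exactly which operations were invoked.
    struct CountingSharedMutex {
      int locks = 0;
      int unlocks = 0;
      void lock_shared() { ++locks; }
      void unlock_shared() { ++unlocks; }
    };

    int main() {
      CountingSharedMutex m;
      {
        std::shared_lock<CountingSharedMutex> lock(m);
        assert(lock.owns_lock());
      } // the destructor calls unlock_shared()
      assert(m.locks == 1);
      assert(m.unlocks == 1);
      return 0;
    }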
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp
index edb7c42356ac..d36ca1d38f8f 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/lock.pass.cpp
@@ -5,10 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-// ALLOW_RETRIES: 2
// <shared_mutex>
@@ -16,10 +15,9 @@
// void lock();
+#include <atomic>
#include <cassert>
-#include <chrono>
-#include <cstdlib>
-#include <mutex>
+#include <mutex> // std::defer_lock
#include <shared_mutex>
#include <system_error>
#include <thread>
@@ -28,71 +26,99 @@
#include "make_test_thread.h"
#include "test_macros.h"
-std::shared_timed_mutex m;
+struct Monitor {
+ bool lock_shared_called = false;
+ bool unlock_shared_called = false;
+};
-typedef std::chrono::system_clock Clock;
-typedef Clock::time_point time_point;
-typedef Clock::duration duration;
-typedef std::chrono::milliseconds ms;
-typedef std::chrono::nanoseconds ns;
+struct TrackedMutex {
+ Monitor* monitor = nullptr;
-ms WaitTime = ms(250);
+ void lock_shared() {
+ if (monitor != nullptr)
+ monitor->lock_shared_called = true;
+ }
+ void unlock_shared() {
+ if (monitor != nullptr)
+ monitor->unlock_shared_called = true;
+ }
+};
-// Thread sanitizer causes more overhead and will sometimes cause this test
-// to fail. To prevent this we give Thread sanitizer more time to complete the
-// test.
-#if !defined(TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT)
-ms Tolerance = ms(25);
-#else
-ms Tolerance = ms(25 * 5);
-#endif
+template <class Mutex>
+void test() {
+ // Basic sanity test
+ {
+ Mutex mutex;
+ std::vector<std::thread> threads;
+ std::atomic<bool> ready(false);
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
+ std::shared_lock<Mutex> lock(mutex, std::defer_lock);
+ lock.lock();
+ assert(lock.owns_lock());
+ }));
+ }
+
+ ready = true;
+ for (auto& t : threads)
+ t.join();
+ }
-void f()
-{
- std::shared_lock<std::shared_timed_mutex> lk(m, std::defer_lock);
- time_point t0 = Clock::now();
- lk.lock();
- time_point t1 = Clock::now();
- assert(lk.owns_lock() == true);
- ns d = t1 - t0 - WaitTime;
- assert(d < Tolerance); // within tolerance
+ // Try locking the same shared_lock again in the same thread. This should throw an exception.
+ {
+ Mutex mutex;
+ std::shared_lock<Mutex> lock(mutex, std::defer_lock);
+ lock.lock();
+ assert(lock.owns_lock());
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- lk.lock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EDEADLK);
+ try {
+ lock.lock();
+ assert(false);
+ } catch (std::system_error const& e) {
+ assert(e.code() == std::errc::resource_deadlock_would_occur);
}
#endif
- lk.unlock();
- lk.release();
+ }
+
+ // Try locking a shared_lock that isn't associated to any mutex. This should throw an exception.
+ {
+ std::shared_lock<Mutex> lock; // no associated mutex
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- lk.lock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
+ try {
+ lock.lock();
+ assert(false);
+ } catch (std::system_error const& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
}
#endif
+ }
}
-int main(int, char**)
-{
- m.lock();
- std::vector<std::thread> v;
- for (int i = 0; i < 5; ++i)
- v.push_back(support::make_test_thread(f));
- std::this_thread::sleep_for(WaitTime);
- m.unlock();
- for (auto& t : v)
- t.join();
+int main(int, char**) {
+#if TEST_STD_VER >= 17
+ test<std::shared_mutex>();
+#endif
+ test<std::shared_timed_mutex>();
+ test<TrackedMutex>();
+
+ // Use shared_lock with a dummy mutex class that tracks whether each
+ // operation has been called or not.
+ {
+ Monitor monitor;
+ TrackedMutex mutex{&monitor};
+
+ std::shared_lock<TrackedMutex> lock(mutex, std::defer_lock);
+ lock.lock();
+ assert(monitor.lock_shared_called);
+ assert(lock.owns_lock());
+
+ lock.unlock();
+ assert(monitor.unlock_shared_called);
+ }
return 0;
}
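
The switch from raw EDEADLK/EPERM values to std::errc comparisons above matches the error condition portably rather than a platform-specific errno. A minimal sketch of the deadlock case, assuming exceptions are enabled and a C++14 (shared_timed_mutex) toolchain:

    #include <shared_mutex>
    #include <system_error>
    #include <cassert>

    int main() {
      std::shared_timed_mutex m;
      std::shared_lock<std::shared_timed_mutex> lock(m); // already owns the lock
      try {
        lock.lock(); // locking an already-owned shared_lock must throw
        assert(false);
      } catch (std::system_error const& e) {
        // Comparing against std::errc checks the error condition, not the raw value.
        assert(e.code() == std::errc::resource_deadlock_would_occur);
      }
      return 0;
    }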
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp
index 0e707fcf2d50..b6146680b6e3 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.shared/thread.lock.shared.locking/try_lock.pass.cpp
@@ -5,11 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-//
+
// UNSUPPORTED: no-threads
// UNSUPPORTED: c++03, c++11
-//
-// ALLOW_RETRIES: 2
// <shared_mutex>
@@ -17,60 +15,115 @@
// bool try_lock();
+#include <atomic>
#include <cassert>
-#include <mutex>
+#include <mutex> // std::defer_lock
#include <shared_mutex>
#include <system_error>
+#include <thread>
+#include <vector>
+#include "make_test_thread.h"
#include "test_macros.h"
-bool try_lock_called = false;
+struct Monitor {
+ bool try_lock_shared_called = false;
+ bool unlock_shared_called = false;
+};
-struct mutex
-{
- bool try_lock_shared()
- {
- try_lock_called = !try_lock_called;
- return try_lock_called;
- }
- void unlock_shared() {}
+struct TrackedMutex {
+ Monitor* monitor = nullptr;
+
+ bool try_lock_shared() {
+ if (monitor != nullptr)
+ monitor->try_lock_shared_called = true;
+ return true;
+ }
+ void unlock_shared() {
+ if (monitor != nullptr)
+ monitor->unlock_shared_called = true;
+ }
};
-mutex m;
+template <class Mutex>
+void test() {
+ // Basic sanity test
+ {
+ Mutex mutex;
+ std::vector<std::thread> threads;
+ std::atomic<bool> ready(false);
+ for (int i = 0; i != 5; ++i) {
+ threads.push_back(support::make_test_thread([&] {
+ while (!ready) {
+ // spin
+ }
-int main(int, char**)
-{
- std::shared_lock<mutex> lk(m, std::defer_lock);
- assert(lk.try_lock() == true);
- assert(try_lock_called == true);
- assert(lk.owns_lock() == true);
-#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock();
- assert(false);
+ std::shared_lock<Mutex> lock(mutex, std::defer_lock);
+ bool result = lock.try_lock();
+ assert(result);
+ assert(lock.owns_lock());
+ }));
}
- catch (std::system_error& e)
- {
- assert(e.code().value() == EDEADLK);
+
+ ready = true;
+ for (auto& t : threads)
+ t.join();
+ }
+
+ // Make sure that we throw an exception if we try to re-lock a mutex that is
+ // already locked by the current thread.
+ {
+ Mutex mutex;
+
+ std::shared_lock<Mutex> lock(mutex, std::defer_lock);
+ assert(lock.try_lock());
+ assert(lock.owns_lock());
+#ifndef TEST_HAS_NO_EXCEPTIONS
+ try {
+ TEST_IGNORE_NODISCARD lock.try_lock();
+ assert(false);
+ } catch (std::system_error const& e) {
+ assert(e.code() == std::errc::resource_deadlock_would_occur);
}
#endif
- lk.unlock();
- assert(lk.try_lock() == false);
- assert(try_lock_called == false);
- assert(lk.owns_lock() == false);
- lk.release();
+ }
+
+ // Make sure that we throw an exception if we try to lock a shared_lock
+ // that is not associated to any mutex.
+ {
+ std::shared_lock<Mutex> lock; // not associated to a mutex
#ifndef TEST_HAS_NO_EXCEPTIONS
- try
- {
- TEST_IGNORE_NODISCARD lk.try_lock();
- assert(false);
- }
- catch (std::system_error& e)
- {
- assert(e.code().value() == EPERM);
+ try {
+ TEST_IGNORE_NODISCARD lock.try_lock();
+ assert(false);
+ } catch (std::system_error const& e) {
+ assert(e.code() == std::errc::operation_not_permitted);
}
#endif
+ }
+}
+
+int main(int, char**) {
+#if TEST_STD_VER >= 17
+ test<std::shared_mutex>();
+#endif
+ test<std::shared_timed_mutex>();
+ test<TrackedMutex>();
+
+ // Use shared_lock with a dummy mutex class that tracks whether each
+ // operation has been called or not.
+ {
+ Monitor monitor;
+ TrackedMutex mutex{&monitor};
+
+ std::shared_lock<TrackedMutex> lock(mutex, std::defer_lock);
+ bool result = lock.try_lock();
+ assert(result);
+ assert(monitor.try_lock_shared_called);
+ assert(lock.owns_lock());
+ lock.unlock();
+ assert(monitor.unlock_shared_called);
+ }
return 0;
}
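
A short standalone sketch (illustrative only) of the try_lock() contract the rewritten test checks: on an uncontended mutex, a deferred shared_lock acquires ownership and reports it via owns_lock().

    #include <shared_mutex>
    #include <mutex> // std::defer_lock
    #include <cassert>

    int main() {
      std::shared_timed_mutex m;
      std::shared_lock<std::shared_timed_mutex> lock(m, std::defer_lock);
      assert(!lock.owns_lock());
      bool acquired = lock.try_lock(); // uncontended, so this should succeed
      assert(acquired);
      assert(lock.owns_lock());
      lock.unlock();
      assert(!lock.owns_lock());
      return 0;
    }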
diff --git a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
index ffe651c6b744..337ad4c45a94 100644
--- a/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/thread/thread.mutex/thread.lock/thread.lock.unique/implicit_ctad.pass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// UNSUPPORTED: no-threads
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <mutex>
diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp
index 863b4a5c2569..50c89d6b8db6 100644
--- a/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.bm/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <functional>
diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp
index 778f6d3bd2cb..9cb4ef5afbb5 100644
--- a/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.bmh/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <functional>
diff --git a/libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp b/libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp
index 3c9029566d92..6334ed16ed52 100644
--- a/libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/func.search/func.search.default/implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <functional>
diff --git a/libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp b/libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp
index 03c46d232c38..bb4fb4bf71c1 100644
--- a/libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/operations.implicit_ctad.pass.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-// UNSUPPORTED: c++98, c++03, c++11, c++14
+// UNSUPPORTED: c++03, c++11, c++14
// <functional>
diff --git a/libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp b/libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp
index 68a82f6ce90b..7b3107029d4d 100644
--- a/libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp
+++ b/libcxx/test/std/utilities/utility/mem.res/mem.res.global/new_delete_resource.pass.cpp
@@ -76,7 +76,7 @@ void test_allocate_deallocate() {
ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.checkOutstandingNewEq(1));
ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.checkLastNewSizeEq(50));
- r1.deallocate(ret, 1);
+ r1.deallocate(ret, 50);
assert(globalMemCounter.checkOutstandingNewEq(0));
ASSERT_WITH_LIBRARY_INTERNAL_ALLOCATIONS(globalMemCounter.checkDeleteCalledEq(1));
}
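
The one-character fix above matters because memory_resource::deallocate() requires the same size (and alignment) that was passed to the matching allocate() call. A minimal sketch of the correct pairing, assuming C++17 <memory_resource>:

    #include <memory_resource>
    #include <cstddef>

    int main() {
      std::pmr::memory_resource* r = std::pmr::new_delete_resource();
      constexpr std::size_t bytes = 50;
      void* p = r->allocate(bytes);
      // Passing a different size here than to allocate() violates the precondition
      // of deallocate() and is undefined behavior.
      r->deallocate(p, bytes);
      return 0;
    }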
diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index c81b56b1af54..093cd39ea64c 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -38,6 +38,39 @@ def _getAndroidDeviceApi(cfg):
)
)
+
+def _mingwSupportsModules(cfg):
+ # Only mingw headers are known to work with libc++ built as a module,
+ # at the moment.
+ if not "__MINGW32__" in compilerMacros(cfg):
+ return False
+ # For mingw headers, check for a version known to support being built
+ # as a module.
+ return sourceBuilds(
+ cfg,
+ """
+ #include <_mingw_mac.h>
+ #if __MINGW64_VERSION_MAJOR < 12
+ #error Headers known to be incompatible
+ #elif __MINGW64_VERSION_MAJOR == 12
+ // The headers were fixed to work with libc++ modules during
+ // __MINGW64_VERSION_MAJOR == 12. The headers became compatible
+ // with libc++ built as a module in
+ // 1652e9241b5d8a5a779c6582b1c3c4f4a7cc66e5 (Apr 2024), but the
+ // following commit 8c13b28ace68f2c0094d45121d59a4b951b533ed
+ // removed the now unused __mingw_static_ovr define. Use this
+ // as indicator for whether we've got new enough headers.
+ #ifdef __mingw_static_ovr
+ #error Headers too old
+ #endif
+ #else
+ // __MINGW64_VERSION_MAJOR > 12 should be ok.
+ #endif
+ int main() { return 0; }
+ """,
+ )
+
+
# Lit features are evaluated in order. Some checks may require the compiler detection to have
# run first in order to work properly.
DEFAULT_FEATURES = [
@@ -281,7 +314,7 @@ DEFAULT_FEATURES = [
# Any declaration of a library function shall have external linkage.
when=lambda cfg: "__ANDROID__" in compilerMacros(cfg)
or "__FreeBSD__" in compilerMacros(cfg)
- or "_WIN32" in compilerMacros(cfg)
+ or ("_WIN32" in compilerMacros(cfg) and not _mingwSupportsModules(cfg))
or platform.system().lower().startswith("aix")
# Avoid building on platforms that don't support modules properly.
or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier"),
diff --git a/libcxxabi/include/cxxabi.h b/libcxxabi/include/cxxabi.h
index d0701181751c..0e3969084e04 100644
--- a/libcxxabi/include/cxxabi.h
+++ b/libcxxabi/include/cxxabi.h
@@ -48,13 +48,17 @@ extern _LIBCXXABI_FUNC_VIS void
__cxa_free_exception(void *thrown_exception) throw();
// This function is an LLVM extension, which mirrors the same extension in libsupc++ and libcxxrt
extern _LIBCXXABI_FUNC_VIS __cxa_exception*
+#ifdef __wasm__
+// In Wasm, a destructor returns its argument
+__cxa_init_primary_exception(void* object, std::type_info* tinfo, void*(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw();
+#else
__cxa_init_primary_exception(void* object, std::type_info* tinfo, void(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw();
+#endif
// 2.4.3 Throwing the Exception Object
extern _LIBCXXABI_FUNC_VIS _LIBCXXABI_NORETURN void
__cxa_throw(void *thrown_exception, std::type_info *tinfo,
-#ifdef __USING_WASM_EXCEPTIONS__
- // In Wasm, a destructor returns its argument
+#ifdef __wasm__
void *(_LIBCXXABI_DTOR_FUNC *dest)(void *));
#else
void (_LIBCXXABI_DTOR_FUNC *dest)(void *));
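
Illustrative only: under the Wasm exception ABI described by the comment above, the destructor pointer handed to __cxa_init_primary_exception/__cxa_throw returns its argument instead of void. A hand-written thunk with that shape might look like the following (Widget and destroy_widget are invented names):

    #include <new>

    struct Widget {
      ~Widget() {}
    };

    // Wasm-style destructor signature: takes the object pointer and returns it.
    void* destroy_widget(void* p) {
      static_cast<Widget*>(p)->~Widget();
      return p;
    }

    int main() {
      alignas(Widget) unsigned char storage[sizeof(Widget)];
      Widget* w = new (storage) Widget();
      void* same = destroy_widget(w);
      return same == w ? 0 : 1;
    }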
diff --git a/libcxxabi/src/cxa_exception.cpp b/libcxxabi/src/cxa_exception.cpp
index 65e9f4504dda..ff69a4c65e46 100644
--- a/libcxxabi/src/cxa_exception.cpp
+++ b/libcxxabi/src/cxa_exception.cpp
@@ -207,7 +207,12 @@ void __cxa_free_exception(void *thrown_object) throw() {
}
__cxa_exception* __cxa_init_primary_exception(void* object, std::type_info* tinfo,
+#ifdef __wasm__
+// In Wasm, a destructor returns its argument
+ void *(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw() {
+#else
void(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw() {
+#endif
__cxa_exception* exception_header = cxa_exception_from_thrown_object(object);
exception_header->referenceCount = 0;
exception_header->unexpectedHandler = std::get_unexpected();
@@ -267,7 +272,7 @@ will call terminate, assuming that there was no handler for the
exception.
*/
void
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __wasm__
// In Wasm, a destructor returns its argument
__cxa_throw(void *thrown_object, std::type_info *tinfo, void *(_LIBCXXABI_DTOR_FUNC *dest)(void *)) {
#else
diff --git a/libcxxabi/src/cxa_exception.h b/libcxxabi/src/cxa_exception.h
index 10712f6f47bb..aba08f299210 100644
--- a/libcxxabi/src/cxa_exception.h
+++ b/libcxxabi/src/cxa_exception.h
@@ -43,7 +43,7 @@ struct _LIBCXXABI_HIDDEN __cxa_exception {
// Manage the exception object itself.
std::type_info *exceptionType;
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __wasm__
// In Wasm, a destructor returns its argument
void *(_LIBCXXABI_DTOR_FUNC *exceptionDestructor)(void *);
#else
diff --git a/libcxxabi/src/cxa_personality.cpp b/libcxxabi/src/cxa_personality.cpp
index d95d78131940..843a18a4cbd8 100644
--- a/libcxxabi/src/cxa_personality.cpp
+++ b/libcxxabi/src/cxa_personality.cpp
@@ -70,7 +70,7 @@ extern "C" EXCEPTION_DISPOSITION _GCC_specific_handler(PEXCEPTION_RECORD,
+------------------+--+-----+-----+------------------------+--------------------------+
| callSiteTableLength | (ULEB128) | Call Site Table length, used to find Action table |
+---------------------+-----------+---------------------------------------------------+
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__WASM_EXCEPTIONS__)
+---------------------+-----------+------------------------------------------------+
| Beginning of Call Site Table The current ip lies within the |
| ... (start, length) range of one of these |
@@ -84,7 +84,7 @@ extern "C" EXCEPTION_DISPOSITION _GCC_specific_handler(PEXCEPTION_RECORD,
| +-------------+---------------------------------+------------------------------+ |
| ... |
+----------------------------------------------------------------------------------+
-#else // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#else // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
+---------------------+-----------+------------------------------------------------+
| Beginning of Call Site Table The current ip is a 1-based index into |
| ... this table. Or it is -1 meaning no |
@@ -97,7 +97,7 @@ extern "C" EXCEPTION_DISPOSITION _GCC_specific_handler(PEXCEPTION_RECORD,
| +-------------+---------------------------------+------------------------------+ |
| ... |
+----------------------------------------------------------------------------------+
-#endif // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#endif // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
+---------------------------------------------------------------------+
| Beginning of Action Table ttypeIndex == 0 : cleanup |
| ... ttypeIndex > 0 : catch |
@@ -547,7 +547,7 @@ void
set_registers(_Unwind_Exception* unwind_exception, _Unwind_Context* context,
const scan_results& results)
{
-#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__USING_WASM_EXCEPTIONS__)
+#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__WASM_EXCEPTIONS__)
#define __builtin_eh_return_data_regno(regno) regno
#elif defined(__ibmxl__)
// IBM xlclang++ compiler does not support __builtin_eh_return_data_regno.
@@ -642,7 +642,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
// Get beginning current frame's code (as defined by the
// emitted dwarf code)
uintptr_t funcStart = _Unwind_GetRegionStart(context);
-#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__USING_WASM_EXCEPTIONS__)
+#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__WASM_EXCEPTIONS__)
if (ip == uintptr_t(-1))
{
// no action
@@ -652,9 +652,9 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
else if (ip == 0)
call_terminate(native_exception, unwind_exception);
// ip is 1-based index into call site table
-#else // !__USING_SJLJ_EXCEPTIONS__ && !__USING_WASM_EXCEPTIONS__
+#else // !__USING_SJLJ_EXCEPTIONS__ && !__WASM_EXCEPTIONS__
uintptr_t ipOffset = ip - funcStart;
-#endif // !__USING_SJLJ_EXCEPTIONS__ && !__USING_WASM_EXCEPTIONS__
+#endif // !__USING_SJLJ_EXCEPTIONS__ && !__WASM_EXCEPTIONS__
const uint8_t* classInfo = NULL;
// Note: See JITDwarfEmitter::EmitExceptionTable(...) for corresponding
// dwarf emission
@@ -675,7 +675,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
// Walk call-site table looking for range that
// includes current PC.
uint8_t callSiteEncoding = *lsda++;
-#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__USING_WASM_EXCEPTIONS__)
+#if defined(__USING_SJLJ_EXCEPTIONS__) || defined(__WASM_EXCEPTIONS__)
(void)callSiteEncoding; // When using SjLj/Wasm exceptions, callSiteEncoding is never used
#endif
uint32_t callSiteTableLength = static_cast<uint32_t>(readULEB128(&lsda));
@@ -686,7 +686,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
while (callSitePtr < callSiteTableEnd)
{
// There is one entry per call site.
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__WASM_EXCEPTIONS__)
// The call sites are non-overlapping in [start, start+length)
// The call sites are ordered in increasing value of start
uintptr_t start = readEncodedPointer(&callSitePtr, callSiteEncoding);
@@ -694,15 +694,15 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
uintptr_t landingPad = readEncodedPointer(&callSitePtr, callSiteEncoding);
uintptr_t actionEntry = readULEB128(&callSitePtr);
if ((start <= ipOffset) && (ipOffset < (start + length)))
-#else // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#else // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
// ip is 1-based index into this table
uintptr_t landingPad = readULEB128(&callSitePtr);
uintptr_t actionEntry = readULEB128(&callSitePtr);
if (--ip == 0)
-#endif // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#endif // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
{
// Found the call site containing ip.
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__WASM_EXCEPTIONS__)
if (landingPad == 0)
{
// No handler here
@@ -710,9 +710,9 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
return;
}
landingPad = (uintptr_t)lpStart + landingPad;
-#else // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#else // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
++landingPad;
-#endif // __USING_SJLJ_EXCEPTIONS__ || __USING_WASM_EXCEPTIONS__
+#endif // __USING_SJLJ_EXCEPTIONS__ || __WASM_EXCEPTIONS__
results.landingPad = landingPad;
if (actionEntry == 0)
{
@@ -838,7 +838,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
action += actionOffset;
} // there is no break out of this loop, only return
}
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__WASM_EXCEPTIONS__)
else if (ipOffset < start)
{
// There is no call site for this ip
@@ -846,7 +846,7 @@ static void scan_eh_tab(scan_results &results, _Unwind_Action actions,
// Possible stack corruption.
call_terminate(native_exception, unwind_exception);
}
-#endif // !__USING_SJLJ_EXCEPTIONS__ && !__USING_WASM_EXCEPTIONS__
+#endif // !__USING_SJLJ_EXCEPTIONS__ && !__WASM_EXCEPTIONS__
} // there might be some tricky cases which break out of this loop
// It is possible that no eh table entry specify how to handle
@@ -903,7 +903,7 @@ _UA_CLEANUP_PHASE
*/
#if !defined(_LIBCXXABI_ARM_EHABI)
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __WASM_EXCEPTIONS__
_Unwind_Reason_Code __gxx_personality_wasm0
#elif defined(__SEH__) && !defined(__USING_SJLJ_EXCEPTIONS__)
static _Unwind_Reason_Code __gxx_personality_imp
@@ -972,7 +972,7 @@ __gxx_personality_v0
exc->languageSpecificData = results.languageSpecificData;
exc->catchTemp = reinterpret_cast<void*>(results.landingPad);
exc->adjustedPtr = results.adjustedPtr;
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __WASM_EXCEPTIONS__
// Wasm only uses a single phase (_UA_SEARCH_PHASE), so save the
// results here.
set_registers(unwind_exception, context, results);
diff --git a/libunwind/include/__libunwind_config.h b/libunwind/include/__libunwind_config.h
index 8db336b2d727..028b9e3baa80 100644
--- a/libunwind/include/__libunwind_config.h
+++ b/libunwind/include/__libunwind_config.h
@@ -180,6 +180,10 @@
#endif
#define _LIBUNWIND_HIGHEST_DWARF_REGISTER \
_LIBUNWIND_HIGHEST_DWARF_REGISTER_LOONGARCH
+#elif defined(__wasm__)
+// Unused
+#define _LIBUNWIND_CONTEXT_SIZE 0
+#define _LIBUNWIND_CURSOR_SIZE 0
# else
# error "Unsupported architecture."
# endif
diff --git a/libunwind/src/Unwind-wasm.c b/libunwind/src/Unwind-wasm.c
index f7f39d38b59c..b18b32c5d178 100644
--- a/libunwind/src/Unwind-wasm.c
+++ b/libunwind/src/Unwind-wasm.c
@@ -14,7 +14,7 @@
#include "config.h"
-#ifdef __USING_WASM_EXCEPTIONS__
+#ifdef __WASM_EXCEPTIONS__
#include "unwind.h"
#include <threads.h>
@@ -120,4 +120,4 @@ _Unwind_GetRegionStart(struct _Unwind_Context *context) {
return 0;
}
-#endif // defined(__USING_WASM_EXCEPTIONS__)
+#endif // defined(__WASM_EXCEPTIONS__)
diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp
index 7753936a5894..66fe8e2a32cc 100644
--- a/libunwind/src/UnwindCursor.hpp
+++ b/libunwind/src/UnwindCursor.hpp
@@ -2416,7 +2416,7 @@ int UnwindCursor<A, R>::stepWithTBTable(pint_t pc, tbtable *TBTable,
}
// Reset LR in the current context.
- newRegisters.setLR(NULL);
+ newRegisters.setLR(static_cast<uintptr_t>(NULL));
_LIBUNWIND_TRACE_UNWINDING(
"Extract info from lastStack=%p, returnAddress=%p",
diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c
index 05d0f2cb0a0a..48e7bc3b9e00 100644
--- a/libunwind/src/UnwindLevel1.c
+++ b/libunwind/src/UnwindLevel1.c
@@ -31,7 +31,8 @@
#include "libunwind_ext.h"
#include "unwind.h"
-#if !defined(_LIBUNWIND_ARM_EHABI) && !defined(__USING_SJLJ_EXCEPTIONS__)
+#if !defined(_LIBUNWIND_ARM_EHABI) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
+ !defined(__wasm__)
#ifndef _LIBUNWIND_SUPPORT_SEH_UNWIND
diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S
index 42c2488fc7cf..67d9e0571189 100644
--- a/libunwind/src/UnwindRegistersRestore.S
+++ b/libunwind/src/UnwindRegistersRestore.S
@@ -20,7 +20,7 @@
.text
#endif
-#if !defined(__USING_SJLJ_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
#if defined(__i386__)
DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_x86_jumpto)
@@ -1232,7 +1232,7 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind19Registers_loongarch6jumptoEv)
#endif
-#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) */
+#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__) */
NO_EXEC_STACK_DIRECTIVE
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index 19a0e87d683c..5bf6055fe414 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -20,7 +20,7 @@
.text
#endif
-#if !defined(__USING_SJLJ_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
#if defined(__i386__)
@@ -1177,6 +1177,6 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
WEAK_ALIAS(__unw_getcontext, unw_getcontext)
-#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) */
+#endif /* !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__) */
NO_EXEC_STACK_DIRECTIVE
diff --git a/libunwind/src/libunwind.cpp b/libunwind/src/libunwind.cpp
index 217dde909863..cf39ec5f7dbd 100644
--- a/libunwind/src/libunwind.cpp
+++ b/libunwind/src/libunwind.cpp
@@ -26,7 +26,7 @@
#include <sanitizer/asan_interface.h>
#endif
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__USING_WASM_EXCEPTIONS__)
+#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
#include "AddressSpace.hpp"
#include "UnwindCursor.hpp"
@@ -347,8 +347,7 @@ void __unw_remove_dynamic_eh_frame_section(unw_word_t eh_frame_start) {
}
#endif // defined(_LIBUNWIND_SUPPORT_DWARF_UNWIND)
-#endif // !defined(__USING_SJLJ_EXCEPTIONS__) &&
- // !defined(__USING_WASM_EXCEPTIONS__)
+#endif // !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__wasm__)
#ifdef __APPLE__
diff --git a/lld/ELF/Arch/AVR.cpp b/lld/ELF/Arch/AVR.cpp
index 9211eabc9669..2275f8694287 100644
--- a/lld/ELF/Arch/AVR.cpp
+++ b/lld/ELF/Arch/AVR.cpp
@@ -231,14 +231,13 @@ void AVR::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
// Since every jump destination is word aligned we gain an extra bit
case R_AVR_7_PCREL: {
- checkInt(loc, val - 2, 7, rel);
+ checkInt(loc, val - 2, 8, rel);
checkAlignment(loc, val, 2, rel);
const uint16_t target = (val - 2) >> 1;
write16le(loc, (read16le(loc) & 0xfc07) | ((target & 0x7f) << 3));
break;
}
case R_AVR_13_PCREL: {
- checkInt(loc, val - 2, 13, rel);
checkAlignment(loc, val, 2, rel);
const uint16_t target = (val - 2) >> 1;
write16le(loc, (read16le(loc) & 0xf000) | (target & 0xfff));
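
The widened check above reflects that the 7-bit branch field holds a word (2-byte) offset, so the admissible byte displacement spans 8 signed bits. A small arithmetic sketch under that assumption (fitsSignedBits is a stand-in for lld's checkInt, not the real helper):

    #include <cassert>
    #include <cstdint>

    // True if v fits in a signed two's-complement field of the given width.
    bool fitsSignedBits(int64_t v, int bits) {
      return v >= -(int64_t(1) << (bits - 1)) && v < (int64_t(1) << (bits - 1));
    }

    int main() {
      // 7-bit word offsets cover [-64, +63] words, i.e. [-128, +126] bytes.
      int64_t maxBytes = 63 * 2, minBytes = -64 * 2;
      assert(fitsSignedBits(maxBytes, 8) && !fitsSignedBits(maxBytes, 7));
      assert(fitsSignedBits(minBytes, 8) && !fitsSignedBits(minBytes, 7));
      return 0;
    }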
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index dbb81412453a..f0dfe7f377de 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -102,6 +102,9 @@ enum class GnuStackKind { None, Exec, NoExec };
// For --lto=
enum LtoKind : uint8_t {UnifiedThin, UnifiedRegular, Default};
+// For -z gcs=
+enum class GcsPolicy { Implicit, Never, Always };
+
struct SymbolVersion {
llvm::StringRef name;
bool isExternCpp;
@@ -188,6 +191,7 @@ struct Config {
StringRef zBtiReport = "none";
StringRef zCetReport = "none";
StringRef zPauthReport = "none";
+ StringRef zGcsReport = "none";
bool ltoBBAddrMap;
llvm::StringRef ltoBasicBlockSections;
std::pair<llvm::StringRef, llvm::StringRef> thinLTOObjectSuffixReplace;
@@ -341,6 +345,7 @@ struct Config {
UnresolvedPolicy unresolvedSymbols;
UnresolvedPolicy unresolvedSymbolsInShlib;
Target2Policy target2;
+ GcsPolicy zGcs;
bool power10Stubs;
ARMVFPArgKind armVFPArgs = ARMVFPArgKind::Default;
BuildIdKind buildId = BuildIdKind::None;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 028cdcc83d2f..ddc574a11314 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -466,6 +466,10 @@ static void checkOptions() {
error("-z bti-report only supported on AArch64");
if (config->zPauthReport != "none")
error("-z pauth-report only supported on AArch64");
+ if (config->zGcsReport != "none")
+ error("-z gcs-report only supported on AArch64");
+ if (config->zGcs != GcsPolicy::Implicit)
+ error("-z gcs only supported on AArch64");
}
if (config->emachine != EM_386 && config->emachine != EM_X86_64 &&
@@ -560,6 +564,25 @@ static uint8_t getZStartStopVisibility(opt::InputArgList &args) {
return ret;
}
+static GcsPolicy getZGcs(opt::InputArgList &args) {
+ GcsPolicy ret = GcsPolicy::Implicit;
+ for (auto *arg : args.filtered(OPT_z)) {
+ std::pair<StringRef, StringRef> kv = StringRef(arg->getValue()).split('=');
+ if (kv.first == "gcs") {
+ arg->claim();
+ if (kv.second == "implicit")
+ ret = GcsPolicy::Implicit;
+ else if (kv.second == "never")
+ ret = GcsPolicy::Never;
+ else if (kv.second == "always")
+ ret = GcsPolicy::Always;
+ else
+ error("unknown -z gcs= value: " + kv.second);
+ }
+ }
+ return ret;
+}
+
// Report a warning for an unknown -z option.
static void checkZOptions(opt::InputArgList &args) {
// This function is called before getTarget(), when certain options are not
@@ -1438,6 +1461,7 @@ static void readConfigs(opt::InputArgList &args) {
config->zCopyreloc = getZFlag(args, "copyreloc", "nocopyreloc", true);
config->zForceBti = hasZOption(args, "force-bti");
config->zForceIbt = hasZOption(args, "force-ibt");
+ config->zGcs = getZGcs(args);
config->zGlobal = hasZOption(args, "global");
config->zGnustack = getZGnuStack(args);
config->zHazardplt = hasZOption(args, "hazardplt");
@@ -1510,6 +1534,7 @@ static void readConfigs(opt::InputArgList &args) {
auto reports = {std::make_pair("bti-report", &config->zBtiReport),
std::make_pair("cet-report", &config->zCetReport),
+ std::make_pair("gcs-report", &config->zGcsReport),
std::make_pair("pauth-report", &config->zPauthReport)};
for (opt::Arg *arg : args.filtered(OPT_z)) {
std::pair<StringRef, StringRef> option =
@@ -2678,6 +2703,11 @@ static void readSecurityNotes() {
"GNU_PROPERTY_AARCH64_FEATURE_1_BTI property");
checkAndReportMissingFeature(
+ config->zGcsReport, features, GNU_PROPERTY_AARCH64_FEATURE_1_GCS,
+ toString(f) + ": -z gcs-report: file does not have "
+ "GNU_PROPERTY_AARCH64_FEATURE_1_GCS property");
+
+ checkAndReportMissingFeature(
config->zCetReport, features, GNU_PROPERTY_X86_FEATURE_1_IBT,
toString(f) + ": -z cet-report: file does not have "
"GNU_PROPERTY_X86_FEATURE_1_IBT property");
@@ -2729,6 +2759,12 @@ static void readSecurityNotes() {
// Force enable Shadow Stack.
if (config->zShstk)
config->andFeatures |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+
+ // Force enable/disable GCS
+ if (config->zGcs == GcsPolicy::Always)
+ config->andFeatures |= GNU_PROPERTY_AARCH64_FEATURE_1_GCS;
+ else if (config->zGcs == GcsPolicy::Never)
+ config->andFeatures &= ~GNU_PROPERTY_AARCH64_FEATURE_1_GCS;
}
static void initSectionsAndLocalSyms(ELFFileBase *file, bool ignoreComdats) {
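
A compact sketch (illustrative, not the linker's code) of how the three -z gcs policies added above combine with the AND of the input files' feature bits; FEATURE_GCS is a placeholder constant, not the real ELF define:

    #include <cassert>
    #include <cstdint>

    enum class GcsPolicy { Implicit, Never, Always };
    constexpr uint32_t FEATURE_GCS = 1u << 2; // placeholder bit

    // Start from the AND of all input features, then force the bit per -z gcs.
    uint32_t combine(uint32_t andOfInputs, GcsPolicy policy) {
      if (policy == GcsPolicy::Always)
        return andOfInputs | FEATURE_GCS;
      if (policy == GcsPolicy::Never)
        return andOfInputs & ~FEATURE_GCS;
      return andOfInputs; // Implicit: keep whatever all inputs agreed on
    }

    int main() {
      assert(combine(0, GcsPolicy::Always) & FEATURE_GCS);             // forced on
      assert(!(combine(FEATURE_GCS, GcsPolicy::Never) & FEATURE_GCS)); // forced off
      assert(combine(FEATURE_GCS, GcsPolicy::Implicit) & FEATURE_GCS); // inherited
      return 0;
    }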
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index 883a6079bf50..ff61a566f52f 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -69,6 +69,7 @@ defm compress_debug_sections:
defm compress_sections: EEq<"compress-sections",
"Compress output sections that match the glob and do not have the SHF_ALLOC flag. "
+ "The sections remain uncompressed if compressed content would be larger. "
"The compression level is <level> (if specified) or a default speed-focused level">,
MetaVarName<"<section-glob>={none,zlib,zstd}[:level]">;
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
index fcb4c4387aa9..60de10061c53 100644
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -344,9 +344,10 @@ template <class ELFT> void OutputSection::maybeCompress() {
(void)sizeof(Elf_Chdr);
DebugCompressionType ctype = DebugCompressionType::None;
+ size_t compressedSize = sizeof(Elf_Chdr);
unsigned level = 0; // default compression level
if (!(flags & SHF_ALLOC) && config->compressDebugSections &&
- name.starts_with(".debug_") && size)
+ name.starts_with(".debug_"))
ctype = *config->compressDebugSections;
for (auto &[glob, t, l] : config->compressSections)
if (glob.match(name))
@@ -360,7 +361,6 @@ template <class ELFT> void OutputSection::maybeCompress() {
}
llvm::TimeTraceScope timeScope("Compress sections");
- compressed.uncompressedSize = size;
auto buf = std::make_unique<uint8_t[]>(size);
// Write uncompressed data to a temporary zero-initialized buffer.
{
@@ -378,7 +378,6 @@ template <class ELFT> void OutputSection::maybeCompress() {
[[maybe_unused]] constexpr size_t shardSize = 1 << 20;
auto shardsIn = split(ArrayRef<uint8_t>(buf.get(), size), shardSize);
const size_t numShards = shardsIn.size();
- compressed.numShards = numShards;
auto shardsOut = std::make_unique<SmallVector<uint8_t, 0>[]>(numShards);
#if LLVM_ENABLE_ZSTD
@@ -409,9 +408,8 @@ template <class ELFT> void OutputSection::maybeCompress() {
shardsOut[i] = std::move(out);
});
compressed.type = ELFCOMPRESS_ZSTD;
- size = sizeof(Elf_Chdr);
for (size_t i = 0; i != numShards; ++i)
- size += shardsOut[i].size();
+ compressedSize += shardsOut[i].size();
}
#endif
@@ -434,18 +432,23 @@ template <class ELFT> void OutputSection::maybeCompress() {
     // Update section size and combine Adler-32 checksums.
uint32_t checksum = 1; // Initial Adler-32 value
- size = sizeof(Elf_Chdr) + 2; // Elf_Chdir and zlib header
+    compressedSize += 2; // Elf_Chdr and zlib header
for (size_t i = 0; i != numShards; ++i) {
- size += shardsOut[i].size();
+ compressedSize += shardsOut[i].size();
checksum = adler32_combine(checksum, shardsAdler[i], shardsIn[i].size());
}
- size += 4; // checksum
+ compressedSize += 4; // checksum
compressed.type = ELFCOMPRESS_ZLIB;
compressed.checksum = checksum;
}
#endif
+ if (compressedSize >= size)
+ return;
+ compressed.uncompressedSize = size;
compressed.shards = std::move(shardsOut);
+ compressed.numShards = numShards;
+ size = compressedSize;
flags |= SHF_COMPRESSED;
}
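
The early return added above keeps a section uncompressed whenever the compressed form (headers included) would not be smaller. A tiny sketch of that decision rule, with invented names:

    #include <cstddef>
    #include <cassert>

    // Adopt the compressed form only when it is strictly smaller than the original.
    bool shouldCompress(std::size_t uncompressedSize, std::size_t compressedSizeWithHeader) {
      return compressedSizeWithHeader < uncompressedSize;
    }

    int main() {
      assert(shouldCompress(4096, 512)); // large win: keep the compressed payload
      assert(!shouldCompress(16, 64));   // tiny section: framing overhead dominates
      assert(!shouldCompress(64, 64));   // equal size: not worth rewriting
      return 0;
    }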
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 0df13f07f560..da3b926d02a2 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -148,6 +148,7 @@ Alias for
.Fl -color-diagnostics Ns = Ns Cm auto .
.It Fl -compress-debug-sections Ns = Ns Ar value
Compress DWARF debug sections.
+The sections remain uncompressed if compressed content would be larger.
.Cm value
may be
.Pp
@@ -163,6 +164,7 @@ Use the default compression level in zstd.
.Pp
.It Fl -compress-sections Ns = Ns Ar section-glob={none,zlib,zstd}[:level]
Compress output sections that match the glob and do not have the SHF_ALLOC flag.
+The matched sections remain uncompressed if compressed content would be larger.
The compression level is
.Cm level
(if specified) or a default speed-focused level.
@@ -420,9 +422,7 @@ Disable string merging.
.It Cm 1
Enable string merging.
.It Cm 2
-Enable string tail merging. If
-.Fl -compress-debug-sections
-is given, compress debug sections at compression level 6 instead of 1.
+Enable string tail merging.
.El
.Pp
.Fl O Ns Cm 1
diff --git a/lld/test/ELF/aarch64-feature-gcs.s b/lld/test/ELF/aarch64-feature-gcs.s
new file mode 100644
index 000000000000..7a08673dbb7e
--- /dev/null
+++ b/lld/test/ELF/aarch64-feature-gcs.s
@@ -0,0 +1,134 @@
+# REQUIRES: aarch64
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func1-gcs.s -o func1-gcs.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func2.s -o func2.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func2-gcs.s -o func2-gcs.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func3.s -o func3.o
+# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu func3-gcs.s -o func3-gcs.o
+
+## GCS should be enabled when it's enabled in all inputs or when it's forced on.
+
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o gcs
+# RUN: llvm-readelf -n gcs | FileCheck --check-prefix GCS %s
+# RUN: ld.lld func1-gcs.o func3-gcs.o --shared -o gcs.so
+# RUN: llvm-readelf -n gcs.so | FileCheck --check-prefix GCS %s
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o force-gcs -z gcs=always
+# RUN: llvm-readelf -n force-gcs | FileCheck --check-prefix GCS %s
+# RUN: ld.lld func2-gcs.o func3.o --shared -o force-gcs.so -z gcs=always
+# RUN: llvm-readelf -n force-gcs.so | FileCheck --check-prefix GCS %s
+# RUN: ld.lld func2-gcs.o func3.o --shared -o force-gcs2.so -z gcs=never -z gcs=always
+# RUN: llvm-readelf -n force-gcs2.so | FileCheck --check-prefix GCS %s
+
+# GCS: Properties: aarch64 feature: GCS
+
+## GCS should not be enabled if it's not enabled in at least one input.
+
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o no-gcs
+# RUN: llvm-readelf -n no-gcs | count 0
+# RUN: ld.lld func2-gcs.o func3.o --shared -o no-gcs.so
+
+## GCS should be disabled with gcs=never, even if GCS is present in all inputs.
+
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs=never -o never-gcs
+# RUN: llvm-readelf -n never-gcs | count 0
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs=always -z gcs=never -o never-gcs2
+# RUN: llvm-readelf -n never-gcs2 | count 0
+
+## gcs-report should report any input files that don't have the gcs property.
+
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s
+# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
+# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
+# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | count 0
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | count 0
+# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | count 0
+
+# REPORT-WARN: warning: func2.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property
+# REPORT-ERROR: error: func3.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property
+
+## An invalid gcs option should give an error
+# RUN: not ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs=nonsense 2>&1 | FileCheck --check-prefix=INVALID %s
+
+# INVALID: error: unknown -z gcs= value: nonsense
+
+#--- func1-gcs.s
+.section ".note.gnu.property", "a"
+.long 4
+.long 0x10
+.long 0x5
+.asciz "GNU"
+
+.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
+.long 4
+.long 4 // GNU_PROPERTY_AARCH64_FEATURE_1_GCS
+.long 0
+
+.text
+.globl _start
+.type func1,%function
+func1:
+ bl func2
+ ret
+
+#--- func2.s
+
+.text
+.globl func2
+.type func2,@function
+func2:
+ .globl func3
+ .type func3, @function
+ bl func3
+ ret
+
+#--- func2-gcs.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 0x10
+.long 0x5
+.asciz "GNU"
+
+.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
+.long 4
+.long 4 // GNU_PROPERTY_AARCH64_FEATURE_1_GCS
+.long 0
+
+.text
+.globl func2
+.type func2,@function
+func2:
+ .globl func3
+ .type func3, @function
+ bl func3
+ ret
+
+#--- func3.s
+
+.text
+.globl func3
+.type func3,@function
+func3:
+ ret
+
+#--- func3-gcs.s
+
+.section ".note.gnu.property", "a"
+.long 4
+.long 0x10
+.long 0x5
+.asciz "GNU"
+
+.long 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
+.long 4
+.long 4 // GNU_PROPERTY_AARCH64_FEATURE_1_GCS
+.long 0
+
+.text
+.globl func3
+.type func3,@function
+func3:
+ ret
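A minimal sketch, with hypothetical names, of the policy the RUN lines in this new test exercise: the GCS note is emitted only when every input object carries GNU_PROPERTY_AARCH64_FEATURE_1_GCS (the feature bits are effectively AND-combined across inputs), unless -z gcs=always forces it on or -z gcs=never forces it off, with the last -z gcs= value winning.

    // Illustrative only; not lld's option handling.
    enum class GcsMode { Implicit, Always, Never }; // last -z gcs= value wins

    bool outputHasGcs(GcsMode mode, bool allInputsHaveGcs) {
      if (mode == GcsMode::Always)
        return true;
      if (mode == GcsMode::Never)
        return false;
      return allInputsHaveGcs;
    }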
diff --git a/lld/test/ELF/avr-reloc-error.s b/lld/test/ELF/avr-reloc-error.s
index 0a30f68d168e..f177e44f753f 100644
--- a/lld/test/ELF/avr-reloc-error.s
+++ b/lld/test/ELF/avr-reloc-error.s
@@ -3,7 +3,7 @@
# RUN: rm -rf %t && split-file %s %t && cd %t
# RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-7.s -o avr-pcrel-7.o
-# RUN: not ld.lld avr-pcrel-7.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1044 --defsym=callee2=0x100f 2>&1 | \
+# RUN: not ld.lld avr-pcrel-7.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1084 --defsym=callee2=0x100f 2>&1 | \
# RUN: FileCheck %s --check-prefix=PCREL7
# RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-13.s -o avr-pcrel-13.o
# RUN: not ld.lld avr-pcrel-13.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x2000 --defsym=callee1=0x2004 --defsym=callee2=0x100f 2>&1 | \
@@ -20,7 +20,7 @@
__start:
# PCREL7-NOT: callee0
-# PCREL7: error: {{.*}} relocation R_AVR_7_PCREL out of range: {{.*}} is not in [-64, 63]; references 'callee1'
+# PCREL7: error: {{.*}} relocation R_AVR_7_PCREL out of range: {{.*}} is not in [-128, 127]; references 'callee1'
# PCREL7: error: {{.*}} improper alignment for relocation R_AVR_7_PCREL: {{.*}} is not aligned to 2 bytes
brne callee0
breq callee1
@@ -34,7 +34,6 @@ brlt callee2
__start:
# PCREL13-NOT: callee0
-# PCREL13: error: {{.*}} relocation R_AVR_13_PCREL out of range: {{.*}} is not in [-4096, 4095]; references 'callee1'
# PCREL13: error: {{.*}} improper alignment for relocation R_AVR_13_PCREL: {{.*}} is not aligned to 2 bytes
rjmp callee0
rcall callee1
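A minimal sketch, grounded only in the updated diagnostic above, of the new bound for R_AVR_7_PCREL: the displacement must fall in [-128, 127]. The helper name is illustrative, not lld's.

    #include <cstdint>

    inline bool fitsPCRel7(int64_t displacement) {
      return displacement >= -128 && displacement <= 127; // matches "not in [-128, 127]"
    }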
diff --git a/lld/test/ELF/avr-reloc.s b/lld/test/ELF/avr-reloc.s
index 172c0e03ba74..ec088eaa149d 100644
--- a/lld/test/ELF/avr-reloc.s
+++ b/lld/test/ELF/avr-reloc.s
@@ -82,6 +82,12 @@ sbic b, 1 ; R_AVR_PORT5
; CHECK-NEXT: rjmp .-36
; CHECK-NEXT: breq .+26
; CHECK-NEXT: breq .-40
+; CHECK-NEXT: rjmp .-4096
+; CHECK-NEXT: rjmp .+4094
+; CHECK-NEXT: rjmp .+4094
+; CHECK-NEXT: rjmp .-4096
+; CHECK-NEXT: breq .-128
+; CHECK-NEXT: breq .+126
; HEX-LABEL: section .PCREL:
; HEX-NEXT: 0fc0eecf 69f061f3
foo:
@@ -89,6 +95,12 @@ rjmp foo + 32 ; R_AVR_13_PCREL
rjmp foo - 32 ; R_AVR_13_PCREL
breq foo + 32 ; R_AVR_7_PCREL
breq foo - 32 ; R_AVR_7_PCREL
+rjmp 1f - 4096 $ 1: ; R_AVR_13_PCREL
+rjmp 1f + 4094 $ 1: ; R_AVR_13_PCREL
+rjmp 1f - 4098 $ 1: ; R_AVR_13_PCREL (overflow)
+rjmp 1f + 4096 $ 1: ; R_AVR_13_PCREL (overflow)
+breq 1f - 128 $ 1: ; R_AVR_7_PCREL
+breq 1f + 126 $ 1: ; R_AVR_7_PCREL
.section .LDSSTS,"ax",@progbits
; CHECK-LABEL: section .LDSSTS:
diff --git a/lld/test/ELF/compress-debug-sections-zstd.s b/lld/test/ELF/compress-debug-sections-zstd.s
index 97ab192a52f4..d9f29af99974 100644
--- a/lld/test/ELF/compress-debug-sections-zstd.s
+++ b/lld/test/ELF/compress-debug-sections-zstd.s
@@ -3,22 +3,25 @@
# RUN: llvm-mc -filetype=obj -triple=x86_64 --compress-debug-sections=zstd %s -o %t.o
# RUN: ld.lld %t.o -o %t.so -shared
-# RUN: llvm-readelf -S -x .debug_str %t.so | FileCheck %s
+# RUN: llvm-readelf -S -p .debug_str %t.so | FileCheck %s
# CHECK: .debug_str PROGBITS [[#%x,]] [[#%x,]] [[#%x,]] 01 MS 0 0 1
-# CHECK: Hex dump of section '.debug_str':
-# CHECK-NEXT: 0x00000000 73686f72 7420756e 7369676e 65642069 short unsigned i
-# CHECK-NEXT: 0x00000010 6e740075 6e736967 6e656420 63686172 nt.unsigned char
-# CHECK-NEXT: 0x00000020 00636861 72006c6f 6e672075 6e736967 .char.long unsig
-# CHECK-NEXT: 0x00000030 6e656420 696e7400 756e7369 676e6564 ned int.unsigned
-# CHECK-NEXT: 0x00000040 20696e74 00 int.
+# CHECK: String dump of section '.debug_str':
+# CHECK-NEXT: [ 0] {{A+}}
+# CHECK-NEXT: [ 81] short unsigned int
+# CHECK-NEXT: [ 94] unsigned char
+# CHECK-NEXT: [ a2] char
+# CHECK-NEXT: [ a7] long unsigned int
+# CHECK-NEXT: [ b9] unsigned int
# RUN: ld.lld %t.o -o %t.so -shared --compress-debug-sections=zstd
# RUN: llvm-readelf -S %t.so | FileCheck %s --check-prefix=OUTPUT-SEC
# RUN: llvm-objcopy --decompress-debug-sections %t.so
-# RUN: llvm-readelf -S -x .debug_str %t.so | FileCheck %s
+# RUN: llvm-readelf -S -p .debug_str %t.so | FileCheck %s
-# OUTPUT-SEC: .debug_str PROGBITS [[#%x,]] [[#%x,]] [[#%x,]] 01 MSC 0 0 1
+# OUTPUT-SEC: .debug_str PROGBITS [[#%x,]] [[#%x,]] [[#%x,]] 01 MSC 0 0 1
+# OUTPUT-SEC-NEXT: .debug_frame PROGBITS [[#%x,]] [[#%x,]] 000000 00 0 0 1
+# OUTPUT-SEC-NEXT: .debug_loc PROGBITS [[#%x,]] [[#%x,]] 000010 00 0 0 1
.section .debug_str,"MS",@progbits,1
.LASF2:
@@ -31,3 +34,11 @@
.string "char"
.LASF1:
.string "unsigned char"
+.Lunused:
+ .fill 128, 1, 0x41
+ .byte 0
+
+## Test sections where compressed content would be larger.
+.section .debug_frame,""
+.section .debug_loc,""
+.space 16
diff --git a/lld/test/ELF/compress-sections-special.s b/lld/test/ELF/compress-sections-special.s
index 80c61fe626a4..7e474ac7c7d6 100644
--- a/lld/test/ELF/compress-sections-special.s
+++ b/lld/test/ELF/compress-sections-special.s
@@ -14,7 +14,7 @@
# CHECK: warning: {{.*}}: unable to get the string table for the SHT_SYMTAB section: SHT_STRTAB string table section
# CHECK: Hex dump of section '.strtab':
-# CHECK-NEXT: 01000000 00000000 1a000000 00000000
+# CHECK-NEXT: 01000000 00000000 5c000000 00000000
# CHECK-NEXT: 01000000 00000000 {{.*}}
# RUN: not ld.lld -shared a.o --compress-sections .dynstr=zlib 2>&1 | FileCheck %s --check-prefix=ERR-ALLOC
@@ -25,6 +25,8 @@ _start:
l0:
g0:
g1:
+.globl ggggggggggggggggggggggggggggggg0
+.globl ggggggggggggggggggggggggggggggg1
.section nonalloc0,""
.quad .text+1
diff --git a/lld/test/ELF/compress-sections.s b/lld/test/ELF/compress-sections.s
index aa30c7a90474..aaad31476044 100644
--- a/lld/test/ELF/compress-sections.s
+++ b/lld/test/ELF/compress-sections.s
@@ -11,10 +11,11 @@
# CHECK1-NEXT: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
# CHECK1: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
# CHECK1-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK1-NEXT: smallc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
# CHECK1-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MS 0 0 1
-# CHECK1: 0000000000000010 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc0) sym0
-# CHECK1: 0000000000000008 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc1) sym1
+# CHECK1: 0000000000000090 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc0) sym0
+# CHECK1: 0000000000000088 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc1) sym1
# RUN: ld.lld -pie a.o --compress-sections '*c0=zlib' --compress-sections .debug_str=zstd:3 -o out2
# RUN: llvm-readelf -SrsX -x nonalloc0 -x .debug_str out2 | FileCheck %s --check-prefix=CHECK2
@@ -24,15 +25,16 @@
# CHECK2-NEXT: foo1 PROGBITS [[#%x,FOO1:]] [[#%x,]] [[#%x,]] 00 A 0 0 8
# CHECK2-NEXT: .text PROGBITS [[#%x,TEXT:]] [[#%x,]] [[#%x,]] 00 AX 0 0 4
# CHECK2: nonalloc0 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 C 0 0 1
-# CHECK2-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 00 0 0 8
+# CHECK2-NEXT: nonalloc1 PROGBITS 0000000000000000 [[#%x,]] 000088 00 0 0 8
+# CHECK2-NEXT: smallc0 PROGBITS 0000000000000000 [[#%x,]] 00000c 00 0 0 1
# CHECK2-NEXT: .debug_str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC 0 0 1
-# CHECK2: 0000000000000010 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc0) sym0
-# CHECK2: 0000000000000008 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc1) sym1
+# CHECK2: 0000000000000090 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc0) sym0
+# CHECK2: 0000000000000088 0 NOTYPE LOCAL DEFAULT [[#]] (nonalloc1) sym1
# CHECK2: Hex dump of section 'nonalloc0':
-## zlib with ch_size=0x10
-# CHECK2-NEXT: 01000000 00000000 10000000 00000000
+## zlib with ch_size=0x90
+# CHECK2-NEXT: 01000000 00000000 90000000 00000000
# CHECK2-NEXT: 01000000 00000000 {{.*}}
# CHECK2: Hex dump of section '.debug_str':
## zstd with ch_size=0x38
@@ -80,20 +82,28 @@ _start:
.balign 8
.quad .text-.
.quad .text-.
+.space 128
.section foo1,"a"
.balign 8
.quad .text-.
.quad .text-.
+.space 128
.section nonalloc0,""
.balign 8
.quad .text+1
.quad .text+2
+.space 128
sym0:
.section nonalloc1,""
.balign 8
.quad 42
+.space 128
sym1:
+.section smallc0,""
+.balign 8
+.space 12
+
.section .debug_str,"MS",@progbits,1
.Linfo_string0:
.asciz "AAAAAAAAAAAAAAAAAAAAAAAAAAA"
diff --git a/lld/test/ELF/compressed-debug-level.test b/lld/test/ELF/compressed-debug-level.test
index ce3a194bd7c2..5a4d37e31eca 100644
--- a/lld/test/ELF/compressed-debug-level.test
+++ b/lld/test/ELF/compressed-debug-level.test
@@ -18,8 +18,8 @@
# RUN: llvm-readelf --sections %t.6 | FileCheck -check-prefixes=HEADER,LEVEL6 %s
# HEADER: [Nr] Name Type Address Off Size
-# LEVEL1: [ 1] .debug_info PROGBITS 00000000 000094 00001{{[bc]}}
-# LEVEL6: [ 1] .debug_info PROGBITS 00000000 000094 00001a
+# LEVEL1: [ 1] .debug_info PROGBITS 00000000 000094 0000{{1[def]|21}}
+# LEVEL6: [ 1] .debug_info PROGBITS 00000000 000094 00001{{[abc]}}
## A little arbitrary debug section which has a different size after
## applying compression of level 1 and 6.
@@ -33,4 +33,4 @@ FileHeader:
Sections:
- Name: .debug_info
Type: SHT_PROGBITS
- Content: '010101010101010201010201'
+ Content: '010101010101010201010201010101010101010201010201010101010101010201010201'
diff --git a/lld/test/ELF/linkerscript/compress-debug-sections.s b/lld/test/ELF/linkerscript/compress-debug-sections.s
index fe1c66dbdbdc..8d06689cc871 100644
--- a/lld/test/ELF/linkerscript/compress-debug-sections.s
+++ b/lld/test/ELF/linkerscript/compress-debug-sections.s
@@ -34,3 +34,5 @@
.section .debug_str,"MS",@progbits,1
.asciz "AAA"
.asciz "BBB"
+ .fill 64,1,0x41
+ .byte 0
diff --git a/lld/test/ELF/linkerscript/compress-sections.s b/lld/test/ELF/linkerscript/compress-sections.s
index 9b4574a1778c..5131fa754224 100644
--- a/lld/test/ELF/linkerscript/compress-sections.s
+++ b/lld/test/ELF/linkerscript/compress-sections.s
@@ -10,10 +10,11 @@
# CHECK-NEXT: str PROGBITS 0000000000000000 [[#%x,]] [[#%x,]] 01 MSC 0 0 1
# CHECK: 0000000000000000 0 NOTYPE GLOBAL DEFAULT [[#]] (nonalloc) nonalloc_start
-# CHECK: 0000000000000023 0 NOTYPE GLOBAL DEFAULT [[#]] (nonalloc) nonalloc_end
+# CHECK: 0000000000000063 0 NOTYPE GLOBAL DEFAULT [[#]] (nonalloc) nonalloc_end
# CHECK: String dump of section 'str':
# CHECK-NEXT: [ 0] AAA
-# CHECK-NEXT: [ 4] BBB
+# CHECK-NEXT: [ 4] {{a+}}
+# CHECK-NEXT: [ 45] BBB
## TODO The uncompressed size of 'nonalloc' is dependent on linker script
## commands, which is not handled. We should report an error.
@@ -28,6 +29,7 @@ _start:
.balign 8
.quad .text
.quad .text
+.space 64
.section nonalloc1,""
.balign 8
.quad 42
@@ -35,6 +37,8 @@ _start:
.section str,"MS",@progbits,1
.asciz "AAA"
.asciz "BBB"
+ .fill 64,1,0x61
+ .byte 0
#--- a.lds
SECTIONS {
diff --git a/lld/test/wasm/shared64.s b/lld/test/wasm/shared64.s
index 3401faed8610..73f77436cabf 100644
--- a/lld/test/wasm/shared64.s
+++ b/lld/test/wasm/shared64.s
@@ -154,6 +154,7 @@ get_local_func_address:
# CHECK-NEXT: Index: 0
# CHECK-NEXT: ElemType: FUNCREF
# CHECK-NEXT: Limits:
+# CHECK-NEXT: Flags: [ IS_64 ]
# CHECK-NEXT: Minimum: 0x2
# CHECK-NEXT: - Module: env
# CHECK-NEXT: Field: __stack_pointer
@@ -170,11 +171,6 @@ get_local_func_address:
# CHECK-NEXT: Kind: GLOBAL
# CHECK-NEXT: GlobalType: I64
# CHECK-NEXT: GlobalMutable: false
-# CHECK-NEXT: - Module: env
-# CHECK-NEXT: Field: __table_base32
-# CHECK-NEXT: Kind: GLOBAL
-# CHECK-NEXT: GlobalType: I32
-# CHECK-NEXT: GlobalMutable: false
# CHECK-NEXT: - Module: GOT.mem
# CHECK-NEXT: Field: indirect_func
# CHECK-NEXT: Kind: GLOBAL
@@ -209,7 +205,7 @@ get_local_func_address:
# CHECK-NEXT: Segments:
# CHECK-NEXT: - Offset:
# CHECK-NEXT: Opcode: GLOBAL_GET
-# CHECK-NEXT: Index: 3
+# CHECK-NEXT: Index: 2
# CHECK-NEXT: Functions: [ 3, 2 ]
# check the generated code in __wasm_call_ctors and __wasm_apply_data_relocs functions
@@ -223,7 +219,7 @@ get_local_func_address:
# DIS-NEXT: i64.const 4
# DIS-NEXT: global.get 1
# DIS-NEXT: i64.add
-# DIS-NEXT: global.get 5
+# DIS-NEXT: global.get 4
# DIS-NEXT: i64.store 0:p2align=2
# DIS-NEXT: i64.const 12
# DIS-NEXT: global.get 1
@@ -242,12 +238,12 @@ get_local_func_address:
# DIS-NEXT: i64.const 24
# DIS-NEXT: global.get 1
# DIS-NEXT: i64.add
-# DIS-NEXT: global.get 6
+# DIS-NEXT: global.get 5
# DIS-NEXT: i64.store 0:p2align=2
# DIS-NEXT: i64.const 32
# DIS-NEXT: global.get 1
# DIS-NEXT: i64.add
-# DIS-NEXT: global.get 7
+# DIS-NEXT: global.get 6
# DIS-NEXT: i32.const 4
# DIS-NEXT: i32.add
# DIS-NEXT: i32.store 0
diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index d5d763b0a4ae..cc79f80d005d 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -870,13 +870,6 @@ static void createSyntheticSymbols() {
WasmSym::tableBase = createUndefinedGlobal("__table_base", globalType);
WasmSym::memoryBase->markLive();
WasmSym::tableBase->markLive();
- if (is64) {
- WasmSym::tableBase32 =
- createUndefinedGlobal("__table_base32", &globalTypeI32);
- WasmSym::tableBase32->markLive();
- } else {
- WasmSym::tableBase32 = nullptr;
- }
} else {
// For non-PIC code
WasmSym::stackPointer = createGlobalVariable("__stack_pointer", true);
@@ -923,9 +916,6 @@ static void createOptionalSymbols() {
WasmSym::heapEnd = symtab->addOptionalDataSymbol("__heap_end");
WasmSym::definedMemoryBase = symtab->addOptionalDataSymbol("__memory_base");
WasmSym::definedTableBase = symtab->addOptionalDataSymbol("__table_base");
- if (config->is64.value_or(false))
- WasmSym::definedTableBase32 =
- symtab->addOptionalDataSymbol("__table_base32");
}
// For non-shared memory programs we still need to define __tls_base since we
diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp
index ace6bade02d4..687728d00c85 100644
--- a/lld/wasm/Symbols.cpp
+++ b/lld/wasm/Symbols.cpp
@@ -96,8 +96,6 @@ GlobalSymbol *WasmSym::tlsSize;
GlobalSymbol *WasmSym::tlsAlign;
UndefinedGlobal *WasmSym::tableBase;
DefinedData *WasmSym::definedTableBase;
-UndefinedGlobal *WasmSym::tableBase32;
-DefinedData *WasmSym::definedTableBase32;
UndefinedGlobal *WasmSym::memoryBase;
DefinedData *WasmSym::definedMemoryBase;
TableSymbol *WasmSym::indirectFunctionTable;
diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h
index 38586bbd1323..65a062b8321b 100644
--- a/lld/wasm/Symbols.h
+++ b/lld/wasm/Symbols.h
@@ -603,11 +603,6 @@ struct WasmSym {
// Used in PIC code for offset of indirect function table
static UndefinedGlobal *tableBase;
static DefinedData *definedTableBase;
- // 32-bit copy in wasm64 to work around init expr limitations.
- // These can potentially be removed again once we have
- // https://github.com/WebAssembly/extended-const
- static UndefinedGlobal *tableBase32;
- static DefinedData *definedTableBase32;
// __memory_base
// Used in PIC code for offset of global data
diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp
index 72e255951608..b359e0fdc856 100644
--- a/lld/wasm/SyntheticSections.cpp
+++ b/lld/wasm/SyntheticSections.cpp
@@ -584,12 +584,10 @@ void ElemSection::writeBody() {
initExpr.Extended = false;
if (ctx.isPic) {
initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET;
- initExpr.Inst.Value.Global =
- (config->is64.value_or(false) ? WasmSym::tableBase32
- : WasmSym::tableBase)
- ->getGlobalIndex();
+ initExpr.Inst.Value.Global = WasmSym::tableBase->getGlobalIndex();
} else {
- initExpr.Inst.Opcode = WASM_OPCODE_I32_CONST;
+ bool is64 = config->is64.value_or(false);
+ initExpr.Inst.Opcode = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST;
initExpr.Inst.Value.Int32 = config->tableBase;
}
writeInitExpr(os, initExpr);
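A hedged restatement of the non-PIC branch above: for wasm64 outputs the element segment offset must be an i64 constant, so the opcode is chosen from the configured pointer width. The helper below is a sketch, not lld's code.

    #include "llvm/BinaryFormat/Wasm.h"

    // Pick the init-expr opcode for the table base constant.
    unsigned tableBaseOpcode(bool is64) {
      return is64 ? llvm::wasm::WASM_OPCODE_I64_CONST
                  : llvm::wasm::WASM_OPCODE_I32_CONST;
    }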
diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp
index 55eff995fb8a..7a015764b77c 100644
--- a/lld/wasm/Writer.cpp
+++ b/lld/wasm/Writer.cpp
@@ -939,6 +939,8 @@ static void finalizeIndirectFunctionTable() {
limits.Flags |= WASM_LIMITS_FLAG_HAS_MAX;
limits.Maximum = limits.Minimum;
}
+ if (config->is64.value_or(false))
+ limits.Flags |= WASM_LIMITS_FLAG_IS_64;
WasmSym::indirectFunctionTable->setLimits(limits);
}
@@ -1691,12 +1693,8 @@ void Writer::createSyntheticSectionsPostLayout() {
void Writer::run() {
// For PIC code the table base is assigned dynamically by the loader.
// For non-PIC, we start at 1 so that accessing table index 0 always traps.
- if (!ctx.isPic) {
- if (WasmSym::definedTableBase)
- WasmSym::definedTableBase->setVA(config->tableBase);
- if (WasmSym::definedTableBase32)
- WasmSym::definedTableBase32->setVA(config->tableBase);
- }
+ if (!ctx.isPic && WasmSym::definedTableBase)
+ WasmSym::definedTableBase->setVA(config->tableBase);
log("-- createOutputSegments");
createOutputSegments();
diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index 3c6223b015bb..6458f2e17464 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -187,24 +187,18 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/../clang/include")
# form -W<foo>, and if supported, add the corresponding -Wno-<foo> option.
# Disable GCC warnings
-check_cxx_compiler_flag("-Wdeprecated-declarations" CXX_SUPPORTS_DEPRECATED_DECLARATIONS)
-append_if(CXX_SUPPORTS_DEPRECATED_DECLARATIONS "-Wno-deprecated-declarations" CMAKE_CXX_FLAGS)
-
-check_cxx_compiler_flag("-Wunknown-pragmas" CXX_SUPPORTS_UNKNOWN_PRAGMAS)
-append_if(CXX_SUPPORTS_UNKNOWN_PRAGMAS "-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
-
-check_cxx_compiler_flag("-Wstrict-aliasing" CXX_SUPPORTS_STRICT_ALIASING)
-append_if(CXX_SUPPORTS_STRICT_ALIASING "-Wno-strict-aliasing" CMAKE_CXX_FLAGS)
+append("-Wno-deprecated-declarations" CMAKE_CXX_FLAGS)
+append("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS)
+append("-Wno-strict-aliasing" CMAKE_CXX_FLAGS)
check_cxx_compiler_flag("-Wstringop-truncation" CXX_SUPPORTS_STRINGOP_TRUNCATION)
append_if(CXX_SUPPORTS_STRINGOP_TRUNCATION "-Wno-stringop-truncation" CMAKE_CXX_FLAGS)
# Disable Clang warnings
-check_cxx_compiler_flag("-Wdeprecated-register" CXX_SUPPORTS_DEPRECATED_REGISTER)
-append_if(CXX_SUPPORTS_DEPRECATED_REGISTER "-Wno-deprecated-register" CMAKE_CXX_FLAGS)
-
-check_cxx_compiler_flag("-Wvla-extension" CXX_SUPPORTS_VLA_EXTENSION)
-append_if(CXX_SUPPORTS_VLA_EXTENSION "-Wno-vla-extension" CMAKE_CXX_FLAGS)
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-Wno-deprecated-register" CMAKE_CXX_FLAGS)
+ append("-Wno-vla-extension" CMAKE_CXX_FLAGS)
+endif()
# Disable MSVC warnings
if( MSVC )
diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 09d3d15a9408..33b6a6f79def 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -477,7 +477,6 @@ further by passing the appropriate cmake options, such as:
-DLLDB_ENABLE_PYTHON=0
-DLLDB_ENABLE_LIBEDIT=0
-DLLDB_ENABLE_CURSES=0
- -DLLVM_ENABLE_TERMINFO=0
(see :ref:`Optional Dependencies` for more)
diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h
index 28c723abf279..70dacdcb7986 100644
--- a/lldb/include/lldb/Symbol/CompilerType.h
+++ b/lldb/include/lldb/Symbol/CompilerType.h
@@ -436,7 +436,7 @@ public:
uint32_t *bitfield_bit_size_ptr = nullptr,
bool *is_bitfield_ptr = nullptr) const;
- CompilerType GetChildCompilerTypeAtIndex(
+ llvm::Expected<CompilerType> GetChildCompilerTypeAtIndex(
ExecutionContext *exe_ctx, size_t idx, bool transparent_pointers,
bool omit_empty_base_classes, bool ignore_array_bounds,
std::string &child_name, uint32_t &child_byte_size,
diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h
index 7bcb8d69387a..b4025c173a18 100644
--- a/lldb/include/lldb/Symbol/TypeSystem.h
+++ b/lldb/include/lldb/Symbol/TypeSystem.h
@@ -359,7 +359,7 @@ public:
return CompilerDecl();
}
- virtual CompilerType GetChildCompilerTypeAtIndex(
+ virtual llvm::Expected<CompilerType> GetChildCompilerTypeAtIndex(
lldb::opaque_compiler_type_t type, ExecutionContext *exe_ctx, size_t idx,
bool transparent_pointers, bool omit_empty_base_classes,
bool ignore_array_bounds, std::string &child_name,
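With the return type now llvm::Expected<CompilerType>, callers must consume the error explicitly. A minimal sketch of the pattern, adapted from the ValueObject.cpp hunks later in this patch (surrounding variable declarations omitted):

    auto child_or_err = compiler_type.GetChildCompilerTypeAtIndex(
        &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
        ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
        child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
        child_is_deref_of_parent, valobj, language_flags);
    CompilerType child_type;
    if (!child_or_err)
      LLDB_LOG_ERROR(GetLog(LLDBLog::Types), child_or_err.takeError(),
                     "could not find child: {0}");
    else
      child_type = *child_or_err;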
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index aac0cf51680a..637d34c29715 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -915,8 +915,8 @@ public:
/// \param[in] force_kill
/// Whether lldb should force a kill (instead of a detach) from
/// the inferior process. Normally if lldb launched a binary and
- /// Destory is called, lldb kills it. If lldb attached to a
- /// running process and Destory is called, lldb detaches. If
+ /// Destroy is called, lldb kills it. If lldb attached to a
+ /// running process and Destroy is called, lldb detaches. If
/// this behavior needs to be over-ridden, this is the bool that
/// can be used.
///
diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py
index ebabf348643e..2e537e3fd3ce 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest.py
@@ -542,12 +542,6 @@ def setupSysPath():
lldbDAPExec = os.path.join(lldbDir, "lldb-dap")
if is_exe(lldbDAPExec):
os.environ["LLDBDAP_EXEC"] = lldbDAPExec
- else:
- if not configuration.shouldSkipBecauseOfCategories(["lldb-dap"]):
- print(
- "The 'lldb-dap' executable cannot be located. The lldb-dap tests can not be run as a result."
- )
- configuration.skip_categories.append("lldb-dap")
lldbPythonDir = None # The directory that contains 'lldb/__init__.py'
@@ -929,6 +923,24 @@ def checkPexpectSupport():
configuration.skip_categories.append("pexpect")
+def checkDAPSupport():
+ import lldb
+
+ if "LLDBDAP_EXEC" not in os.environ:
+ msg = (
+ "The 'lldb-dap' executable cannot be located and its tests will not be run."
+ )
+ elif lldb.remote_platform:
+ msg = "lldb-dap tests are not compatible with remote platforms and will not be run."
+ else:
+ msg = None
+
+ if msg:
+ if configuration.verbose:
+ print(msg)
+ configuration.skip_categories.append("lldb-dap")
+
+
def run_suite():
# On MacOS X, check to make sure that domain for com.apple.DebugSymbols defaults
# does not exist before proceeding to running the test suite.
@@ -1029,6 +1041,7 @@ def run_suite():
checkObjcSupport()
checkForkVForkSupport()
checkPexpectSupport()
+ checkDAPSupport()
skipped_categories_list = ", ".join(configuration.skip_categories)
print(
diff --git a/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp b/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
index d7d8c714867e..16c4ee1b88d1 100644
--- a/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
+++ b/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
@@ -198,16 +198,16 @@ void BreakpointResolverFileLine::DeduceSourceMapping(
return;
Log *log = GetLog(LLDBLog::Breakpoints);
- const llvm::StringRef path_separator = llvm::sys::path::get_separator(
- m_location_spec.GetFileSpec().GetPathStyle());
// Check if "b" is a suffix of "a".
// And return std::nullopt if not or the new path
// of "a" after consuming "b" from the back.
auto check_suffix =
- [path_separator](llvm::StringRef a, llvm::StringRef b,
- bool case_sensitive) -> std::optional<llvm::StringRef> {
+ [](llvm::StringRef a, llvm::StringRef b,
+ bool case_sensitive) -> std::optional<llvm::StringRef> {
if (case_sensitive ? a.consume_back(b) : a.consume_back_insensitive(b)) {
- if (a.empty() || a.ends_with(path_separator)) {
+ // Note sc_file_dir and request_file_dir below are normalized
+ // and always contain the path separator '/'.
+ if (a.empty() || a.ends_with("/")) {
return a;
}
}
diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp
index 4397ee14ea07..db96ee2cec38 100644
--- a/lldb/source/Commands/CommandObjectThread.cpp
+++ b/lldb/source/Commands/CommandObjectThread.cpp
@@ -114,8 +114,8 @@ public:
CommandObjectThreadBacktrace(CommandInterpreter &interpreter)
: CommandObjectIterateOverThreads(
interpreter, "thread backtrace",
- "Show thread call stacks. Defaults to the current thread, thread "
- "indexes can be specified as arguments.\n"
+ "Show backtraces of thread call stacks. Defaults to the current "
+ "thread, thread indexes can be specified as arguments.\n"
"Use the thread-index \"all\" to see all threads.\n"
"Use the thread-index \"unique\" to see threads grouped by unique "
"call stacks.\n"
diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt
index 10525ac39e6e..f24dbbd45a8e 100644
--- a/lldb/source/Core/CMakeLists.txt
+++ b/lldb/source/Core/CMakeLists.txt
@@ -11,9 +11,6 @@ set(LLDB_LIBEDIT_LIBS)
if (LLDB_ENABLE_CURSES)
list(APPEND LLDB_CURSES_LIBS ${PANEL_LIBRARIES} ${CURSES_LIBRARIES})
- if(LLVM_ENABLE_TERMINFO)
- list(APPEND LLDB_CURSES_LIBS ${Terminfo_LIBRARIES})
- endif()
if (LLVM_BUILD_STATIC)
list(APPEND LLDB_CURSES_LIBS gpm)
endif()
diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp
index f39bd07a2553..1443d9dfc328 100644
--- a/lldb/source/Core/ValueObject.cpp
+++ b/lldb/source/Core/ValueObject.cpp
@@ -505,15 +505,23 @@ ValueObject *ValueObject::CreateChildAtIndex(size_t idx,
uint64_t language_flags = 0;
const bool transparent_pointers = !synthetic_array_member;
- CompilerType child_compiler_type;
ExecutionContext exe_ctx(GetExecutionContextRef());
- child_compiler_type = GetCompilerType().GetChildCompilerTypeAtIndex(
- &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
- ignore_array_bounds, child_name_str, child_byte_size, child_byte_offset,
- child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
- child_is_deref_of_parent, this, language_flags);
+ auto child_compiler_type_or_err =
+ GetCompilerType().GetChildCompilerTypeAtIndex(
+ &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
+ ignore_array_bounds, child_name_str, child_byte_size,
+ child_byte_offset, child_bitfield_bit_size, child_bitfield_bit_offset,
+ child_is_base_class, child_is_deref_of_parent, this, language_flags);
+ CompilerType child_compiler_type;
+ if (!child_compiler_type_or_err)
+ LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
+ child_compiler_type_or_err.takeError(),
+ "could not find child: {0}");
+ else
+ child_compiler_type = *child_compiler_type_or_err;
+
if (child_compiler_type) {
if (synthetic_index)
child_byte_offset += child_byte_size * synthetic_index;
@@ -2624,16 +2632,23 @@ ValueObjectSP ValueObject::Dereference(Status &error) {
bool child_is_deref_of_parent = false;
const bool transparent_pointers = false;
CompilerType compiler_type = GetCompilerType();
- CompilerType child_compiler_type;
uint64_t language_flags = 0;
ExecutionContext exe_ctx(GetExecutionContextRef());
- child_compiler_type = compiler_type.GetChildCompilerTypeAtIndex(
+ CompilerType child_compiler_type;
+ auto child_compiler_type_or_err = compiler_type.GetChildCompilerTypeAtIndex(
&exe_ctx, 0, transparent_pointers, omit_empty_base_classes,
ignore_array_bounds, child_name_str, child_byte_size, child_byte_offset,
child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
child_is_deref_of_parent, this, language_flags);
+ if (!child_compiler_type_or_err)
+ LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
+ child_compiler_type_or_err.takeError(),
+ "could not find child: {0}");
+ else
+ child_compiler_type = *child_compiler_type_or_err;
+
if (child_compiler_type && child_byte_size) {
ConstString child_name;
if (!child_name_str.empty())
diff --git a/lldb/source/Core/ValueObjectConstResultImpl.cpp b/lldb/source/Core/ValueObjectConstResultImpl.cpp
index e2db3ace1924..493980d7ea96 100644
--- a/lldb/source/Core/ValueObjectConstResultImpl.cpp
+++ b/lldb/source/Core/ValueObjectConstResultImpl.cpp
@@ -17,6 +17,8 @@
#include "lldb/Target/ExecutionContext.h"
#include "lldb/Utility/DataBufferHeap.h"
#include "lldb/Utility/Endian.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
#include "lldb/Utility/Scalar.h"
#include <string>
@@ -66,15 +68,21 @@ ValueObject *ValueObjectConstResultImpl::CreateChildAtIndex(
const bool transparent_pointers = !synthetic_array_member;
CompilerType compiler_type = m_impl_backend->GetCompilerType();
- CompilerType child_compiler_type;
ExecutionContext exe_ctx(m_impl_backend->GetExecutionContextRef());
- child_compiler_type = compiler_type.GetChildCompilerTypeAtIndex(
+ auto child_compiler_type_or_err = compiler_type.GetChildCompilerTypeAtIndex(
&exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
ignore_array_bounds, child_name_str, child_byte_size, child_byte_offset,
child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
child_is_deref_of_parent, m_impl_backend, language_flags);
+ CompilerType child_compiler_type;
+ if (!child_compiler_type_or_err)
+ LLDB_LOG_ERROR(GetLog(LLDBLog::Types),
+ child_compiler_type_or_err.takeError(),
+ "could not find child: {0}");
+ else
+ child_compiler_type = *child_compiler_type_or_err;
// One might think we should check that the size of the children
// is always strictly positive, hence we could avoid creating a
diff --git a/lldb/source/Host/common/Socket.cpp b/lldb/source/Host/common/Socket.cpp
index bd0c127a0895..f9911cf136cb 100644
--- a/lldb/source/Host/common/Socket.cpp
+++ b/lldb/source/Host/common/Socket.cpp
@@ -87,8 +87,7 @@ llvm::Error Socket::Initialize() {
if (err == 0) {
if (wsaData.wVersion < wVersion) {
WSACleanup();
- return llvm::make_error<llvm::StringError>(
- "WSASock version is not expected.", llvm::inconvertibleErrorCode());
+ return llvm::createStringError("WSASock version is not expected.");
}
} else {
return llvm::errorCodeToError(llvm::mapWindowsError(::WSAGetLastError()));
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index 811726e30af4..7f21f382adb8 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -828,11 +828,11 @@ void CommandInterpreter::LoadCommandDictionary() {
std::unique_ptr<CommandObjectRegexCommand> bt_regex_cmd_up(
new CommandObjectRegexCommand(
*this, "_regexp-bt",
- "Show the current thread's call stack. Any numeric argument "
- "displays at most that many "
- "frames. The argument 'all' displays all threads. Use 'settings"
- " set frame-format' to customize the printing of individual frames "
- "and 'settings set thread-format' to customize the thread header.",
+ "Show backtrace of the current thread's call stack. Any numeric "
+ "argument displays at most that many frames. The argument 'all' "
+ "displays all threads. Use 'settings set frame-format' to customize "
+ "the printing of individual frames and 'settings set thread-format' "
+ "to customize the thread header.",
"bt [<digit> | all]", 0, false));
if (bt_regex_cmd_up) {
// accept but don't document "bt -c <number>" -- before bt was a regex
diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp
index 51b7e6b26b6e..4e7d074ace1b 100644
--- a/lldb/source/Interpreter/Options.cpp
+++ b/lldb/source/Interpreter/Options.cpp
@@ -931,8 +931,7 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
Option *long_options = GetLongOptions();
if (long_options == nullptr) {
- return llvm::make_error<llvm::StringError>("Invalid long options",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Invalid long options");
}
std::string short_options = BuildShortOptions(long_options);
@@ -957,8 +956,7 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
break;
if (val == '?') {
- return llvm::make_error<llvm::StringError>(
- "Unknown or ambiguous option", llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Unknown or ambiguous option");
}
if (val == 0)
@@ -980,9 +978,8 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
// See if the option takes an argument, and see if one was supplied.
if (long_options_index == -1) {
- return llvm::make_error<llvm::StringError>(
- llvm::formatv("Invalid option with value '{0}'.", char(val)).str(),
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ llvm::formatv("Invalid option with value '{0}'.", char(val)).str());
}
StreamString option_str;
@@ -995,11 +992,10 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
switch (has_arg) {
case OptionParser::eRequiredArgument:
if (OptionParser::GetOptionArgument() == nullptr) {
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
llvm::formatv("Option '{0}' is missing argument specifier.",
option_str.GetString())
- .str(),
- llvm::inconvertibleErrorCode());
+ .str());
}
[[fallthrough]];
case OptionParser::eOptionalArgument:
@@ -1008,12 +1004,11 @@ llvm::Expected<Args> Options::ParseAlias(const Args &args,
case OptionParser::eNoArgument:
break;
default:
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
llvm::formatv("error with options table; invalid value in has_arg "
"field for option '{0}'.",
char(val))
- .str(),
- llvm::inconvertibleErrorCode());
+ .str());
}
// Find option in the argument list; also see if it was supposed to take an
// argument and if one was supplied. Remove option (and argument, if
@@ -1261,8 +1256,7 @@ llvm::Expected<Args> Options::Parse(const Args &args,
Status error;
Option *long_options = GetLongOptions();
if (long_options == nullptr) {
- return llvm::make_error<llvm::StringError>("Invalid long options.",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Invalid long options.");
}
std::string short_options = BuildShortOptions(long_options);
@@ -1322,9 +1316,8 @@ llvm::Expected<Args> Options::Parse(const Args &args,
if (!platform_sp && require_validation) {
// Caller requires validation but we cannot validate as we don't have
// the mandatory platform against which to validate.
- return llvm::make_error<llvm::StringError>(
- "cannot validate options: no platform available",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "cannot validate options: no platform available");
}
bool validation_failed = false;
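The pattern applied throughout this file and the remaining lldb hunks: llvm::createStringError with only a message defaults to llvm::inconvertibleErrorCode(), so the explicit error-code argument can be dropped. A minimal self-contained sketch, assuming a hypothetical parsing helper:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Error.h"

    llvm::Expected<unsigned> parseLevel(llvm::StringRef s) {
      unsigned level = 0;
      if (s.getAsInteger(10, level))
        return llvm::createStringError("invalid level: " + s.str());
      return level;
    }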
diff --git a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
index 173b5613d1b8..eac058701313 100644
--- a/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
+++ b/lldb/source/Plugins/ABI/PowerPC/ABISysV_ppc64.cpp
@@ -501,14 +501,12 @@ public:
CompilerType &type) {
RegisterContext *reg_ctx = thread.GetRegisterContext().get();
if (!reg_ctx)
- return llvm::make_error<llvm::StringError>(
- LOG_PREFIX "Failed to get RegisterContext",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(LOG_PREFIX
+ "Failed to get RegisterContext");
ProcessSP process_sp = thread.GetProcess();
if (!process_sp)
- return llvm::make_error<llvm::StringError>(
- LOG_PREFIX "GetProcess() failed", llvm::inconvertibleErrorCode());
+ return llvm::createStringError(LOG_PREFIX "GetProcess() failed");
return ReturnValueExtractor(thread, type, reg_ctx, process_sp);
}
@@ -836,7 +834,7 @@ private:
for (uint32_t i = 0; i < n; i++) {
std::string name;
uint32_t size;
- GetChildType(i, name, size);
+ (void)GetChildType(i, name, size);
// NOTE: the offset returned by GetChildCompilerTypeAtIndex()
// can't be used because it never considers alignment bytes
// between struct fields.
@@ -903,7 +901,8 @@ private:
}
// get child
- CompilerType GetChildType(uint32_t i, std::string &name, uint32_t &size) {
+ llvm::Expected<CompilerType> GetChildType(uint32_t i, std::string &name,
+ uint32_t &size) {
// GetChild constant inputs
const bool transparent_pointers = false;
const bool omit_empty_base_classes = true;
diff --git a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
index 9a6e135e0083..2c9b3c425397 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/BlockPointer.cpp
@@ -12,6 +12,7 @@
#include "Plugins/ExpressionParser/Clang/ClangPersistentVariables.h"
#include "Plugins/TypeSystem/Clang/TypeSystemClang.h"
#include "lldb/Core/ValueObject.h"
+#include "lldb/Core/ValueObjectConstResult.h"
#include "lldb/DataFormatters/FormattersHelpers.h"
#include "lldb/Symbol/CompilerType.h"
#include "lldb/Symbol/TypeSystem.h"
@@ -105,13 +106,16 @@ public:
bool child_is_deref_of_parent = false;
uint64_t language_flags = 0;
- const CompilerType child_type =
- m_block_struct_type.GetChildCompilerTypeAtIndex(
- &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
- ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
- child_bitfield_bit_size, child_bitfield_bit_offset,
- child_is_base_class, child_is_deref_of_parent, value_object,
- language_flags);
+ auto child_type_or_err = m_block_struct_type.GetChildCompilerTypeAtIndex(
+ &exe_ctx, idx, transparent_pointers, omit_empty_base_classes,
+ ignore_array_bounds, child_name, child_byte_size, child_byte_offset,
+ child_bitfield_bit_size, child_bitfield_bit_offset, child_is_base_class,
+ child_is_deref_of_parent, value_object, language_flags);
+ if (!child_type_or_err)
+ return ValueObjectConstResult::Create(
+ exe_ctx.GetBestExecutionContextScope(),
+ Status(child_type_or_err.takeError()));
+ CompilerType child_type = *child_type_or_err;
ValueObjectSP struct_pointer_sp =
m_backend.Cast(m_block_struct_type.GetPointerType());
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
index ec5b320e2218..0929d49e55ea 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
@@ -295,13 +295,13 @@ void lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::GetValueOffset(
bool child_is_base_class;
bool child_is_deref_of_parent;
uint64_t language_flags;
- if (tree_node_type
- .GetChildCompilerTypeAtIndex(
- nullptr, 4, true, true, true, child_name, child_byte_size,
- child_byte_offset, child_bitfield_bit_size,
- child_bitfield_bit_offset, child_is_base_class,
- child_is_deref_of_parent, nullptr, language_flags)
- .IsValid())
+ auto child_type =
+ llvm::expectedToStdOptional(tree_node_type.GetChildCompilerTypeAtIndex(
+ nullptr, 4, true, true, true, child_name, child_byte_size,
+ child_byte_offset, child_bitfield_bit_size,
+ child_bitfield_bit_offset, child_is_base_class,
+ child_is_deref_of_parent, nullptr, language_flags));
+ if (child_type && child_type->IsValid())
m_skip_size = (uint32_t)child_byte_offset;
}
}
diff --git a/lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp b/lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp
index f561c21b9d91..77b4301ea22e 100644
--- a/lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp
+++ b/lldb/source/Plugins/Process/NetBSD/NativeThreadNetBSD.cpp
@@ -180,8 +180,6 @@ void NativeThreadNetBSD::SetStepping() {
}
std::string NativeThreadNetBSD::GetName() {
- Log *log = GetLog(POSIXLog::Thread);
-
#ifdef PT_LWPSTATUS
struct ptrace_lwpstatus info = {};
info.pl_lwpid = m_tid;
@@ -193,6 +191,8 @@ std::string NativeThreadNetBSD::GetName() {
return info.pl_name;
#else
std::vector<struct kinfo_lwp> infos;
+ Log *log = GetLog(POSIXLog::Thread);
+
int mib[5] = {CTL_KERN, KERN_LWP, static_cast<int>(m_process.GetID()),
sizeof(struct kinfo_lwp), 0};
size_t size;
diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
index 36812c27a5b6..30af9345999c 100644
--- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
+++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp
@@ -250,6 +250,9 @@ Status ProcessElfCore::DoLoadCore() {
}
}
+ // Try to find gnu build id before we load the executable.
+ UpdateBuildIdForNTFileEntries();
+
// Core files are useless without the main executable. See if we can locate
// the main executable using data we found in the core file notes.
lldb::ModuleSP exe_module_sp = GetTarget().GetExecutableModule();
@@ -258,6 +261,7 @@ Status ProcessElfCore::DoLoadCore() {
if (!m_nt_file_entries.empty()) {
ModuleSpec exe_module_spec;
exe_module_spec.GetArchitecture() = arch;
+ exe_module_spec.GetUUID() = m_nt_file_entries[0].uuid;
exe_module_spec.GetFileSpec().SetFile(m_nt_file_entries[0].path,
FileSpec::Style::native);
if (exe_module_spec.GetFileSpec()) {
@@ -271,6 +275,12 @@ Status ProcessElfCore::DoLoadCore() {
return error;
}
+void ProcessElfCore::UpdateBuildIdForNTFileEntries() {
+ for (NT_FILE_Entry &entry : m_nt_file_entries) {
+ entry.uuid = FindBuidIdInCoreMemory(entry.start);
+ }
+}
+
lldb_private::DynamicLoader *ProcessElfCore::GetDynamicLoader() {
if (m_dyld_up.get() == nullptr)
m_dyld_up.reset(DynamicLoader::FindPlugin(
@@ -983,6 +993,67 @@ llvm::Error ProcessElfCore::ParseThreadContextsFromNoteSegment(
}
}
+UUID ProcessElfCore::FindBuidIdInCoreMemory(lldb::addr_t address) {
+ UUID invalid_uuid;
+ const uint32_t addr_size = GetAddressByteSize();
+ const size_t elf_header_size = addr_size == 4 ? sizeof(llvm::ELF::Elf32_Ehdr)
+ : sizeof(llvm::ELF::Elf64_Ehdr);
+
+ std::vector<uint8_t> elf_header_bytes;
+ elf_header_bytes.resize(elf_header_size);
+ Status error;
+ size_t byte_read =
+ ReadMemory(address, elf_header_bytes.data(), elf_header_size, error);
+ if (byte_read != elf_header_size ||
+ !elf::ELFHeader::MagicBytesMatch(elf_header_bytes.data()))
+ return invalid_uuid;
+ DataExtractor elf_header_data(elf_header_bytes.data(), elf_header_size,
+ GetByteOrder(), addr_size);
+ lldb::offset_t offset = 0;
+
+ elf::ELFHeader elf_header;
+ elf_header.Parse(elf_header_data, &offset);
+
+ const lldb::addr_t ph_addr = address + elf_header.e_phoff;
+
+ std::vector<uint8_t> ph_bytes;
+ ph_bytes.resize(elf_header.e_phentsize);
+ for (unsigned int i = 0; i < elf_header.e_phnum; ++i) {
+ byte_read = ReadMemory(ph_addr + i * elf_header.e_phentsize,
+ ph_bytes.data(), elf_header.e_phentsize, error);
+ if (byte_read != elf_header.e_phentsize)
+ break;
+ DataExtractor program_header_data(ph_bytes.data(), elf_header.e_phentsize,
+ GetByteOrder(), addr_size);
+ offset = 0;
+ elf::ELFProgramHeader program_header;
+ program_header.Parse(program_header_data, &offset);
+ if (program_header.p_type != llvm::ELF::PT_NOTE)
+ continue;
+
+ std::vector<uint8_t> note_bytes;
+ note_bytes.resize(program_header.p_memsz);
+
+ byte_read = ReadMemory(program_header.p_vaddr, note_bytes.data(),
+ program_header.p_memsz, error);
+ if (byte_read != program_header.p_memsz)
+ continue;
+ DataExtractor segment_data(note_bytes.data(), note_bytes.size(),
+ GetByteOrder(), addr_size);
+ auto notes_or_error = parseSegment(segment_data);
+ if (!notes_or_error)
+ return invalid_uuid;
+ for (const CoreNote &note : *notes_or_error) {
+ if (note.info.n_namesz == 4 &&
+ note.info.n_type == llvm::ELF::NT_GNU_BUILD_ID &&
+ "GNU" == note.info.n_name &&
+ note.data.ValidOffsetForDataOfSize(0, note.info.n_descsz))
+ return UUID(note.data.GetData().take_front(note.info.n_descsz));
+ }
+ }
+ return invalid_uuid;
+}
+
uint32_t ProcessElfCore::GetNumThreadContexts() {
if (!m_thread_data_valid)
DoLoadCore();
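The note-matching condition in FindBuidIdInCoreMemory above reduces to: a note named "GNU" whose type is NT_GNU_BUILD_ID and whose descriptor bytes form the UUID. A minimal sketch over a simplified note record (the real code uses lldb's CoreNote and DataExtractor):

    #include <cstdint>
    #include <string>
    #include <vector>

    struct SimpleNote {
      std::string name;
      uint32_t type;
      std::vector<uint8_t> desc; // build-id bytes when this is a build-id note
    };

    bool isGnuBuildId(const SimpleNote &n) {
      return n.name == "GNU" && n.type == 3; // 3 == NT_GNU_BUILD_ID
    }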
diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h
index 2cec635bbacf..668a7c484674 100644
--- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h
+++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h
@@ -117,6 +117,10 @@ private:
lldb::addr_t end;
lldb::addr_t file_ofs;
std::string path;
+ // Add a UUID member for convenient access. The UUID value is not stored
+ // in the NT_FILE entries; we look it up in core memory and cache it here
+ // so it is easy to access.
+ lldb_private::UUID uuid;
};
// For ProcessElfCore only
@@ -158,6 +162,12 @@ private:
// Returns number of thread contexts stored in the core file
uint32_t GetNumThreadContexts();
+ // Populate the GNU build-id UUID for each NT_FILE entry
+ void UpdateBuildIdForNTFileEntries();
+
+ // Returns the GNU build id (from the NT_GNU_BUILD_ID note) found at the given start address
+ lldb_private::UUID FindBuidIdInCoreMemory(lldb::addr_t address);
+
// Parse a contiguous address range of the process from LOAD segment
lldb::addr_t
AddAddressRangeFromLoadSegment(const elf::ELFProgramHeader &header);
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index ce52f3595247..6e676de146b3 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -2494,8 +2494,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule(
auto ExtendSysPath = [&](std::string directory) -> llvm::Error {
if (directory.empty()) {
- return llvm::make_error<llvm::StringError>(
- "invalid directory name", llvm::inconvertibleErrorCode());
+ return llvm::createStringError("invalid directory name");
}
replace_all(directory, "\\", "\\\\");
@@ -2508,10 +2507,8 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule(
directory.c_str(), directory.c_str());
bool syspath_retval =
ExecuteMultipleLines(command_stream.GetData(), exc_options).Success();
- if (!syspath_retval) {
- return llvm::make_error<llvm::StringError>(
- "Python sys.path handling failed", llvm::inconvertibleErrorCode());
- }
+ if (!syspath_retval)
+ return llvm::createStringError("Python sys.path handling failed");
return llvm::Error::success();
};
diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h
index 83215bf3c87e..041b388f9f34 100644
--- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h
+++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.h
@@ -120,9 +120,8 @@ public:
llvm::Expected<lldb::TypeSystemSP>
GetTypeSystemForLanguage(lldb::LanguageType language) override {
- return llvm::make_error<llvm::StringError>(
- "SymbolFileBreakpad does not support GetTypeSystemForLanguage",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "SymbolFileBreakpad does not support GetTypeSystemForLanguage");
}
CompilerDeclContext FindNamespace(ConstString name,
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
index 1b0fefedf983..688a287a0650 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp
@@ -11,6 +11,7 @@
#include <cassert>
#include <algorithm>
+#include <limits>
#include <optional>
#include "llvm/Support/LEB128.h"
@@ -41,13 +42,23 @@ extern int g_verbose;
// Extract a debug info entry for a given DWARFUnit from the data
// starting at the offset in offset_ptr
bool DWARFDebugInfoEntry::Extract(const DWARFDataExtractor &data,
- const DWARFUnit *cu,
+ const DWARFUnit &unit,
lldb::offset_t *offset_ptr) {
m_offset = *offset_ptr;
+ auto report_error = [&](const char *fmt, const auto &...vals) {
+ unit.GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
+ "[{0:x16}]: {1}, please file a bug and "
+ "attach the file at the start of this error message",
+ static_cast<uint64_t>(m_offset), llvm::formatv(fmt, vals...));
+ *offset_ptr = std::numeric_limits<lldb::offset_t>::max();
+ return false;
+ };
+
m_parent_idx = 0;
m_sibling_idx = 0;
const uint64_t abbr_idx = data.GetULEB128(offset_ptr);
- lldbassert(abbr_idx <= UINT16_MAX);
+ if (abbr_idx > std::numeric_limits<uint16_t>::max())
+ return report_error("abbreviation code {0} too big", abbr_idx);
m_abbr_idx = abbr_idx;
if (m_abbr_idx == 0) {
@@ -56,31 +67,18 @@ bool DWARFDebugInfoEntry::Extract(const DWARFDataExtractor &data,
return true; // NULL debug tag entry
}
- const auto *abbrevDecl = GetAbbreviationDeclarationPtr(cu);
- if (abbrevDecl == nullptr) {
- cu->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
- "[{0:x16}]: invalid abbreviation code {1}, "
- "please file a bug and "
- "attach the file at the start of this error message",
- (uint64_t)m_offset, (unsigned)abbr_idx);
- // WE can't parse anymore if the DWARF is borked...
- *offset_ptr = UINT32_MAX;
- return false;
- }
+ const auto *abbrevDecl = GetAbbreviationDeclarationPtr(&unit);
+ if (abbrevDecl == nullptr)
+ return report_error("invalid abbreviation code {0}", abbr_idx);
+
m_tag = abbrevDecl->getTag();
m_has_children = abbrevDecl->hasChildren();
// Skip all data in the .debug_info or .debug_types for the attributes
for (const auto &attribute : abbrevDecl->attributes()) {
- if (DWARFFormValue::SkipValue(attribute.Form, data, offset_ptr, cu))
+ if (DWARFFormValue::SkipValue(attribute.Form, data, offset_ptr, &unit))
continue;
- cu->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
- "[{0:x16}]: Unsupported DW_FORM_{1:x}, please file a bug "
- "and "
- "attach the file at the start of this error message",
- (uint64_t)m_offset, (unsigned)attribute.Form);
- *offset_ptr = m_offset;
- return false;
+ return report_error("Unsupported DW_FORM_{1:x}", attribute.Form);
}
return true;
}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
index c19fa7428549..6773b00e8206 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h
@@ -49,7 +49,7 @@ public:
void BuildFunctionAddressRangeTable(DWARFUnit *cu,
DWARFDebugAranges *debug_aranges) const;
- bool Extract(const DWARFDataExtractor &data, const DWARFUnit *cu,
+ bool Extract(const DWARFDataExtractor &data, const DWARFUnit &cu,
lldb::offset_t *offset_ptr);
using Recurse = DWARFBaseDIE::Recurse;
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index 3a57ec970b07..66a762bf9b68 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -63,7 +63,7 @@ void DWARFUnit::ExtractUnitDIENoDwoIfNeeded() {
// parse
const DWARFDataExtractor &data = GetData();
if (offset < GetNextUnitOffset() &&
- m_first_die.Extract(data, this, &offset)) {
+ m_first_die.Extract(data, *this, &offset)) {
AddUnitDIE(m_first_die);
return;
}
@@ -242,7 +242,7 @@ void DWARFUnit::ExtractDIEsRWLocked() {
die_index_stack.reserve(32);
die_index_stack.push_back(0);
bool prev_die_had_children = false;
- while (offset < next_cu_offset && die.Extract(data, this, &offset)) {
+ while (offset < next_cu_offset && die.Extract(data, *this, &offset)) {
const bool null_die = die.IsNULL();
if (depth == 0) {
assert(m_die_array.empty() && "Compile unit DIE already added");
@@ -670,7 +670,7 @@ DWARFUnit::GetDIE(dw_offset_t die_offset) {
llvm::StringRef DWARFUnit::PeekDIEName(dw_offset_t die_offset) {
DWARFDebugInfoEntry die;
- if (!die.Extract(GetData(), this, &die_offset))
+ if (!die.Extract(GetData(), *this, &die_offset))
return llvm::StringRef();
// Does die contain a DW_AT_Name?
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
index 582d9eac3e1d..369ae46cf264 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -5272,8 +5272,7 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
bool omit_empty_base_classes,
const ExecutionContext *exe_ctx) {
if (!type)
- return llvm::make_error<llvm::StringError>("invalid clang type",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("invalid clang type");
uint32_t num_children = 0;
clang::QualType qual_type(RemoveWrappingTypes(GetQualType(type)));
@@ -5331,9 +5330,8 @@ TypeSystemClang::GetNumChildren(lldb::opaque_compiler_type_t type,
num_children += std::distance(record_decl->field_begin(),
record_decl->field_end());
} else
- return llvm::make_error<llvm::StringError>(
- "incomplete type \"" + GetDisplayTypeName(type).GetString() + "\"",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "incomplete type \"" + GetDisplayTypeName(type).GetString() + "\"");
break;
case clang::Type::ObjCObject:
case clang::Type::ObjCInterface:
@@ -6130,7 +6128,7 @@ uint32_t TypeSystemClang::GetNumPointeeChildren(clang::QualType type) {
return 0;
}
-CompilerType TypeSystemClang::GetChildCompilerTypeAtIndex(
+llvm::Expected<CompilerType> TypeSystemClang::GetChildCompilerTypeAtIndex(
lldb::opaque_compiler_type_t type, ExecutionContext *exe_ctx, size_t idx,
bool transparent_pointers, bool omit_empty_base_classes,
bool ignore_array_bounds, std::string &child_name,
@@ -6156,11 +6154,8 @@ CompilerType TypeSystemClang::GetChildCompilerTypeAtIndex(
auto num_children_or_err =
GetNumChildren(type, omit_empty_base_classes, exe_ctx);
- if (!num_children_or_err) {
- LLDB_LOG_ERRORV(GetLog(LLDBLog::Types), num_children_or_err.takeError(),
- "{0}");
- return {};
- }
+ if (!num_children_or_err)
+ return num_children_or_err.takeError();
const bool idx_is_valid = idx < *num_children_or_err;
int32_t bit_offset;
@@ -6242,7 +6237,8 @@ CompilerType TypeSystemClang::GetChildCompilerTypeAtIndex(
std::optional<uint64_t> size =
base_class_clang_type.GetBitSize(get_exe_scope());
if (!size)
- return {};
+ return llvm::createStringError("no size info for base class");
+
uint64_t base_class_clang_type_bit_size = *size;
// Base classes bit sizes should be a multiple of 8 bits in size
@@ -6274,7 +6270,8 @@ CompilerType TypeSystemClang::GetChildCompilerTypeAtIndex(
std::optional<uint64_t> size =
field_clang_type.GetByteSize(get_exe_scope());
if (!size)
- return {};
+ return llvm::createStringError("no size info for field");
+
child_byte_size = *size;
const uint32_t child_bit_size = child_byte_size * 8;
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
index 042379d40bcb..d67b7a4c9fe7 100644
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h
@@ -887,7 +887,7 @@ public:
static uint32_t GetNumPointeeChildren(clang::QualType type);
- CompilerType GetChildCompilerTypeAtIndex(
+ llvm::Expected<CompilerType> GetChildCompilerTypeAtIndex(
lldb::opaque_compiler_type_t type, ExecutionContext *exe_ctx, size_t idx,
bool transparent_pointers, bool omit_empty_base_classes,
bool ignore_array_bounds, std::string &child_name,
diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp
index 072dbccec44f..f8da9ef7b764 100644
--- a/lldb/source/Symbol/CompilerType.cpp
+++ b/lldb/source/Symbol/CompilerType.cpp
@@ -805,8 +805,7 @@ CompilerType::GetNumChildren(bool omit_empty_base_classes,
if (auto type_system_sp = GetTypeSystem())
return type_system_sp->GetNumChildren(m_type, omit_empty_base_classes,
exe_ctx);
- return llvm::make_error<llvm::StringError>("invalid type",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("invalid type");
}
lldb::BasicType CompilerType::GetBasicTypeEnumeration() const {
@@ -902,7 +901,7 @@ uint32_t CompilerType::GetIndexOfFieldWithName(
return UINT32_MAX;
}
-CompilerType CompilerType::GetChildCompilerTypeAtIndex(
+llvm::Expected<CompilerType> CompilerType::GetChildCompilerTypeAtIndex(
ExecutionContext *exe_ctx, size_t idx, bool transparent_pointers,
bool omit_empty_base_classes, bool ignore_array_bounds,
std::string &child_name, uint32_t &child_byte_size,
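With GetChildCompilerTypeAtIndex now returning llvm::Expected<CompilerType>, callers must check the result and either propagate or consume the error before the Expected is destroyed. A minimal sketch of that caller-side pattern, using a made-up childCountOrError stand-in rather than the real API:

#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

// childCountOrError is a hypothetical stand-in for an API that, like the new
// GetChildCompilerTypeAtIndex, returns llvm::Expected<T> instead of a
// default-constructed value on failure.
static llvm::Expected<int> childCountOrError(bool ok) {
  if (!ok)
    return llvm::createStringError("no children available");
  return 3;
}

int main() {
  llvm::Expected<int> count = childCountOrError(false);
  if (!count) {
    // An Expected carrying an error must be handled (or propagated via
    // takeError()) before destruction; unchecked errors abort when LLVM's
    // error-checking is enabled.
    llvm::logAllUnhandledErrors(count.takeError(), llvm::errs(), "error: ");
    return 1;
  }
  llvm::outs() << "children: " << *count << "\n";
  return 0;
}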
diff --git a/lldb/source/Symbol/Symbol.cpp b/lldb/source/Symbol/Symbol.cpp
index 1895f299cc06..9b0042ffdb4b 100644
--- a/lldb/source/Symbol/Symbol.cpp
+++ b/lldb/source/Symbol/Symbol.cpp
@@ -101,18 +101,15 @@ const Symbol &Symbol::operator=(const Symbol &rhs) {
llvm::Expected<Symbol> Symbol::FromJSON(const JSONSymbol &symbol,
SectionList *section_list) {
if (!section_list)
- return llvm::make_error<llvm::StringError>("no section list provided",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("no section list provided");
if (!symbol.value && !symbol.address)
- return llvm::make_error<llvm::StringError>(
- "symbol must contain either a value or an address",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "symbol must contain either a value or an address");
if (symbol.value && symbol.address)
- return llvm::make_error<llvm::StringError>(
- "symbol cannot contain both a value and an address",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "symbol cannot contain both a value and an address");
const uint64_t size = symbol.size.value_or(0);
const bool is_artificial = false;
@@ -133,9 +130,8 @@ llvm::Expected<Symbol> Symbol::FromJSON(const JSONSymbol &symbol,
AddressRange(section_sp, offset, size), size_is_valid,
contains_linker_annotations, flags);
}
- return llvm::make_error<llvm::StringError>(
- llvm::formatv("no section found for address: {0:x}", *symbol.address),
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ llvm::formatv("no section found for address: {0:x}", *symbol.address));
}
// Absolute symbols encode the integer value in the m_offset of the
diff --git a/lldb/source/Symbol/SymbolFileOnDemand.cpp b/lldb/source/Symbol/SymbolFileOnDemand.cpp
index c6d9f0071c39..0cfe9fc1514b 100644
--- a/lldb/source/Symbol/SymbolFileOnDemand.cpp
+++ b/lldb/source/Symbol/SymbolFileOnDemand.cpp
@@ -457,9 +457,8 @@ SymbolFileOnDemand::GetTypeSystemForLanguage(LanguageType language) {
Log *log = GetLog();
LLDB_LOG(log, "[{0}] {1} is skipped for language type {2}",
GetSymbolFileName(), __FUNCTION__, language);
- return llvm::make_error<llvm::StringError>(
- "GetTypeSystemForLanguage is skipped by SymbolFileOnDemand",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "GetTypeSystemForLanguage is skipped by SymbolFileOnDemand");
}
return m_sym_file_impl->GetTypeSystemForLanguage(language);
}
diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp
index 3665771b1889..4956f10a0b0a 100644
--- a/lldb/source/Symbol/TypeSystem.cpp
+++ b/lldb/source/Symbol/TypeSystem.cpp
@@ -267,9 +267,8 @@ llvm::Expected<lldb::TypeSystemSP> TypeSystemMap::GetTypeSystemForLanguage(
std::optional<CreateCallback> create_callback) {
std::lock_guard<std::mutex> guard(m_mutex);
if (m_clear_in_progress)
- return llvm::make_error<llvm::StringError>(
- "Unable to get TypeSystem because TypeSystemMap is being cleared",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "Unable to get TypeSystem because TypeSystemMap is being cleared");
collection::iterator pos = m_map.find(language);
if (pos != m_map.end()) {
@@ -277,11 +276,10 @@ llvm::Expected<lldb::TypeSystemSP> TypeSystemMap::GetTypeSystemForLanguage(
assert(!pos->second->weak_from_this().expired());
return pos->second;
}
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"TypeSystem for language " +
- llvm::StringRef(Language::GetNameForLanguageType(language)) +
- " doesn't exist",
- llvm::inconvertibleErrorCode());
+ llvm::StringRef(Language::GetNameForLanguageType(language)) +
+ " doesn't exist");
}
for (const auto &pair : m_map) {
@@ -291,31 +289,27 @@ llvm::Expected<lldb::TypeSystemSP> TypeSystemMap::GetTypeSystemForLanguage(
m_map[language] = pair.second;
if (pair.second)
return pair.second;
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"TypeSystem for language " +
- llvm::StringRef(Language::GetNameForLanguageType(language)) +
- " doesn't exist",
- llvm::inconvertibleErrorCode());
+ llvm::StringRef(Language::GetNameForLanguageType(language)) +
+ " doesn't exist");
}
}
if (!create_callback)
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"Unable to find type system for language " +
- llvm::StringRef(Language::GetNameForLanguageType(language)),
- llvm::inconvertibleErrorCode());
-
+ llvm::StringRef(Language::GetNameForLanguageType(language)));
// Cache even if we get a shared pointer that contains a null type system
// back.
TypeSystemSP type_system_sp = (*create_callback)();
m_map[language] = type_system_sp;
if (type_system_sp)
return type_system_sp;
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"TypeSystem for language " +
- llvm::StringRef(Language::GetNameForLanguageType(language)) +
- " doesn't exist",
- llvm::inconvertibleErrorCode());
+ llvm::StringRef(Language::GetNameForLanguageType(language)) +
+ " doesn't exist");
}
llvm::Expected<lldb::TypeSystemSP>
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 77731167995e..ec0da8a1378a 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -2414,8 +2414,7 @@ llvm::Expected<lldb::TypeSystemSP>
Target::GetScratchTypeSystemForLanguage(lldb::LanguageType language,
bool create_on_demand) {
if (!m_valid)
- return llvm::make_error<llvm::StringError>("Invalid Target",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Invalid Target");
if (language == eLanguageTypeMipsAssembler // GNU AS and LLVM use it for all
// assembly code
@@ -2428,9 +2427,8 @@ Target::GetScratchTypeSystemForLanguage(lldb::LanguageType language,
// target language.
} else {
if (languages_for_expressions.Empty())
- return llvm::make_error<llvm::StringError>(
- "No expression support for any languages",
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(
+ "No expression support for any languages");
language = (LanguageType)languages_for_expressions.bitvector.find_first();
}
}
@@ -2574,23 +2572,20 @@ Target::CreateUtilityFunction(std::string expression, std::string name,
return type_system_or_err.takeError();
auto ts = *type_system_or_err;
if (!ts)
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
llvm::StringRef("Type system for language ") +
- Language::GetNameForLanguageType(language) +
- llvm::StringRef(" is no longer live"),
- llvm::inconvertibleErrorCode());
+ Language::GetNameForLanguageType(language) +
+ llvm::StringRef(" is no longer live"));
std::unique_ptr<UtilityFunction> utility_fn =
ts->CreateUtilityFunction(std::move(expression), std::move(name));
if (!utility_fn)
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
llvm::StringRef("Could not create an expression for language") +
- Language::GetNameForLanguageType(language),
- llvm::inconvertibleErrorCode());
+ Language::GetNameForLanguageType(language));
DiagnosticManager diagnostics;
if (!utility_fn->Install(diagnostics, exe_ctx))
- return llvm::make_error<llvm::StringError>(diagnostics.GetString(),
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(diagnostics.GetString());
return std::move(utility_fn);
}
@@ -2621,8 +2616,7 @@ void Target::SetDefaultArchitecture(const ArchSpec &arch) {
llvm::Error Target::SetLabel(llvm::StringRef label) {
size_t n = LLDB_INVALID_INDEX32;
if (llvm::to_integer(label, n))
- return llvm::make_error<llvm::StringError>(
- "Cannot use integer as target label.", llvm::inconvertibleErrorCode());
+ return llvm::createStringError("Cannot use integer as target label.");
TargetList &targets = GetDebugger().GetTargetList();
for (size_t i = 0; i < targets.GetNumTargets(); i++) {
TargetSP target_sp = targets.GetTargetAtIndex(i);
@@ -2790,15 +2784,13 @@ llvm::Expected<lldb_private::Address> Target::GetEntryPointAddress() {
// We haven't found the entry point address. Return an appropriate error.
if (!has_primary_executable)
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"No primary executable found and could not find entry point address in "
- "any executable module",
- llvm::inconvertibleErrorCode());
+ "any executable module");
- return llvm::make_error<llvm::StringError>(
+ return llvm::createStringError(
"Could not find entry point address for primary executable module \"" +
- exe_module->GetFileSpec().GetFilename().GetStringRef() + "\"",
- llvm::inconvertibleErrorCode());
+ exe_module->GetFileSpec().GetFilename().GetStringRef() + "\"");
}
lldb::addr_t Target::GetCallableLoadAddress(lldb::addr_t load_addr,
diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp
index 3bd00bb20da2..18312e87f03e 100644
--- a/lldb/source/Utility/Status.cpp
+++ b/lldb/source/Utility/Status.cpp
@@ -92,8 +92,7 @@ llvm::Error Status::ToError() const {
if (m_type == ErrorType::eErrorTypePOSIX)
return llvm::errorCodeToError(
std::error_code(m_code, std::generic_category()));
- return llvm::make_error<llvm::StringError>(AsCString(),
- llvm::inconvertibleErrorCode());
+ return llvm::createStringError(AsCString());
}
Status::~Status() = default;
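The repeated mechanical change in these hunks swaps llvm::make_error<llvm::StringError>(Msg, llvm::inconvertibleErrorCode()) for the shorter single-argument llvm::createStringError(Msg), which defaults to the same error code. A small self-contained sketch, with openWidget as a made-up example function; only the error-construction calls mirror the patch.

#include "llvm/Support/Error.h"

// openWidget is illustrative only. The single-argument createStringError
// defaults to llvm::inconvertibleErrorCode(), matching the removed
// two-argument spelling.
static llvm::Error openWidget(bool available) {
  if (available)
    return llvm::Error::success();
  // Before:
  //   return llvm::make_error<llvm::StringError>(
  //       "widget unavailable", llvm::inconvertibleErrorCode());
  // After:
  return llvm::createStringError("widget unavailable");
}

int main() {
  if (llvm::Error err = openWidget(false)) {
    llvm::consumeError(std::move(err)); // handle the error path
    return 1;
  }
  return 0;
}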
diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
index c219a4ee5bd9..605561c75737 100644
--- a/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
+++ b/lldb/test/API/functionalities/breakpoint/breakpoint_command/TestBreakpointCommand.py
@@ -6,7 +6,7 @@ Test lldb breakpoint command add/list/delete.
import lldb
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
-from lldbsuite.test import lldbutil
+from lldbsuite.test import lldbutil, lldbplatformutil
import json
import os
import side_effect
@@ -581,7 +581,6 @@ class BreakpointCommandTestCase(TestBase):
self.assertNotEqual(target_stats, None)
self.assertEqual(target_stats["sourceMapDeduceCount"], expected_count)
- @skipIf(oslist=["windows"])
@no_debug_info_test
def test_breakpoints_auto_source_map_relative(self):
"""
@@ -612,8 +611,13 @@ class BreakpointCommandTestCase(TestBase):
self.verify_source_map_deduce_statistics(target, 0)
# Verify auto deduced source map when file path in debug info
- # is a suffix of request breakpoint file path
- path = "/x/y/a/b/c/main.cpp"
+ # is a suffix of request breakpoint file path.
+ # Note the path must be absolute.
+ path = (
+ "/x/y/a/b/c/main.cpp"
+ if lldbplatformutil.getHostPlatform() != "windows"
+ else r"C:\x\y\a\b\c\main.cpp"
+ )
bp = target.BreakpointCreateByLocation(path, 2)
self.assertGreater(
bp.GetNumLocations(),
@@ -625,7 +629,11 @@ class BreakpointCommandTestCase(TestBase):
source_map_json = self.get_source_map_json()
self.assertEqual(len(source_map_json), 1, "source map should not be empty")
- self.verify_source_map_entry_pair(source_map_json[0], ".", "/x/y")
+ self.verify_source_map_entry_pair(
+ source_map_json[0],
+ ".",
+ "/x/y" if lldbplatformutil.getHostPlatform() != "windows" else r"C:\x\y",
+ )
self.verify_source_map_deduce_statistics(target, 1)
# Reset source map.
diff --git a/lldb/test/API/functionalities/thread/exit_during_expression/main.c b/lldb/test/API/functionalities/thread/exit_during_expression/main.c
index eb6d17520986..f633632e96cc 100644
--- a/lldb/test/API/functionalities/thread/exit_during_expression/main.c
+++ b/lldb/test/API/functionalities/thread/exit_during_expression/main.c
@@ -3,7 +3,7 @@
#include <stdio.h>
#include <unistd.h>
-static unsigned int g_timeout = 100000;
+static unsigned int g_timeout = 1000000;
extern int usleep(unsigned int);
diff --git a/lldb/test/API/lang/c/enum_types/TestEnumTypes.py b/lldb/test/API/lang/c/enum_types/TestEnumTypes.py
index 33a846c50d7d..0015c8f47857 100644
--- a/lldb/test/API/lang/c/enum_types/TestEnumTypes.py
+++ b/lldb/test/API/lang/c/enum_types/TestEnumTypes.py
@@ -26,7 +26,9 @@ class EnumTypesTestCase(TestBase):
self.expect("fr var b", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = B$"])
self.expect("fr var c", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = C$"])
self.expect("fr var ab", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = AB$"])
- self.expect("fr var ac", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = A | C$"])
+ self.expect(
+ "fr var ac", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = A \| C$"]
+ )
self.expect("fr var all", DATA_TYPES_DISPLAYED_CORRECTLY, patterns=[" = ALL$"])
# Test that an enum that doesn't match the heuristic we use in
# TypeSystemClang::DumpEnumValue, gets printed as a raw integer.
@@ -37,7 +39,7 @@ class EnumTypesTestCase(TestBase):
self.expect(
"expression (enum bitfield)nonsense",
DATA_TYPES_DISPLAYED_CORRECTLY,
- patterns=[" = B | C | 0x10$"],
+ patterns=[" = B \| C \| 0x10$"],
)
# Break inside the main.
diff --git a/lldb/test/API/python_api/debugger/TestDebuggerAPI.py b/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
index 29b8cfadd947..a007a87ca93e 100644
--- a/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
+++ b/lldb/test/API/python_api/debugger/TestDebuggerAPI.py
@@ -91,6 +91,11 @@ class DebuggerAPITestCase(TestBase):
# Test the local property again, is it set to new_cache_line_size?
self.assertEqual(get_cache_line_size(), new_cache_line_size)
+ @expectedFailureAll(
+ hostoslist=["windows"],
+ remote=True,
+ bugnumber="github.com/llvm/llvm-project/issues/92419",
+ )
def test_CreateTarget_platform(self):
exe = self.getBuildArtifact("a.out")
self.yaml2obj("elf.yaml", exe)
diff --git a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py
index cab0067382ca..b3ba69749f67 100644
--- a/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py
+++ b/lldb/test/API/tools/lldb-dap/attach/TestDAP_attach.py
@@ -41,7 +41,6 @@ class TestDAP_attach(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
@skipIfNetBSD # Hangs on NetBSD as well
- @skipIfRemote
def test_by_pid(self):
"""
Tests attaching to a process by process ID.
@@ -59,7 +58,6 @@ class TestDAP_attach(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
@skipIfNetBSD # Hangs on NetBSD as well
- @skipIfRemote
def test_by_name(self):
"""
Tests attaching to a process by process name.
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py
index cbf190f2b2bf..78ceb7971112 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_logpoints.py
@@ -20,7 +20,6 @@ class TestDAP_logpoints(lldbdap_testcase.DAPTestCaseBase):
self.main_path = os.path.realpath(self.getBuildArtifact(self.main_basename))
@skipIfWindows
- @skipIfRemote
def test_logmessage_basic(self):
"""Tests breakpoint logmessage basic functionality."""
before_loop_line = line_number("main.cpp", "// before loop")
@@ -83,7 +82,6 @@ class TestDAP_logpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertRegex(logMessage_line, reg_str)
@skipIfWindows
- @skipIfRemote
def test_logmessage_advanced(self):
"""Tests breakpoint logmessage functionality for complex expression."""
before_loop_line = line_number("main.cpp", "// before loop")
@@ -144,7 +142,6 @@ class TestDAP_logpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(logMessage_line, logMessage_prefix + str(result))
@skipIfWindows
- @skipIfRemote
def test_logmessage_format(self):
"""
Tests breakpoint logmessage functionality with format.
@@ -209,7 +206,6 @@ class TestDAP_logpoints(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_logmessage_format_failure(self):
"""
Tests breakpoint logmessage format with parsing failure.
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
index 6f57c05e43c8..123fea79c5cd 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setBreakpoints.py
@@ -20,7 +20,6 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.main_path = os.path.realpath(self.getBuildArtifact(self.main_basename))
@skipIfWindows
- @skipIfRemote
def test_source_map(self):
"""
This test simulates building two files in a folder, and then moving
@@ -99,7 +98,6 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(frames[1]["source"]["path"], new_main_path)
@skipIfWindows
- @skipIfRemote
def test_set_and_clear(self):
"""Tests setting and clearing source file and line breakpoints.
This packet is a bit tricky on the debug adaptor side since there
@@ -261,7 +259,6 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_clear_breakpoints_unset_breakpoints(self):
"""Test clearing breakpoints like test_set_and_clear, but clear
breakpoints by omitting the breakpoints array instead of sending an
@@ -305,7 +302,6 @@ class TestDAP_setBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(len(breakpoints), 0, "expect no source breakpoints")
@skipIfWindows
- @skipIfRemote
def test_functionality(self):
"""Tests hitting breakpoints and the functionality of a single
breakpoint, like 'conditions' and 'hitCondition' settings."""
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
index 84d3f12490f3..b2ab12e51bf6 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setExceptionBreakpoints.py
@@ -12,7 +12,6 @@ import lldbdap_testcase
class TestDAP_setExceptionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_functionality(self):
"""Tests setting and clearing exception breakpoints.
This packet is a bit tricky on the debug adaptor side since there
diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py
index 9708effb7a1a..8f00f42574b5 100644
--- a/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/breakpoint/TestDAP_setFunctionBreakpoints.py
@@ -12,7 +12,6 @@ import lldbdap_testcase
class TestDAP_setFunctionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_set_and_clear(self):
"""Tests setting and clearing function breakpoints.
This packet is a bit tricky on the debug adaptor side since there
@@ -123,7 +122,6 @@ class TestDAP_setFunctionBreakpoints(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_functionality(self):
"""Tests hitting breakpoints and the functionality of a single
breakpoint, like 'conditions' and 'hitCondition' settings."""
diff --git a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
index bfdf9ef2897b..226b9385fe71 100644
--- a/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
+++ b/lldb/test/API/tools/lldb-dap/commands/TestDAP_commands.py
@@ -7,7 +7,6 @@ from lldbsuite.test.decorators import *
class TestDAP_commands(lldbdap_testcase.DAPTestCaseBase):
- @skipIfRemote
def test_command_directive_quiet_on_success(self):
program = self.getBuildArtifact("a.out")
command_quiet = (
@@ -61,7 +60,6 @@ class TestDAP_commands(lldbdap_testcase.DAPTestCaseBase):
def test_command_directive_abort_on_error_pre_run_commands(self):
self.do_test_abort_on_error(use_pre_run_commands=True)
- @skipIfRemote
def test_command_directive_abort_on_error_post_run_commands(self):
self.do_test_abort_on_error(use_post_run_commands=True)
diff --git a/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py b/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
index 3250a5093cac..2b3ec656c107 100644
--- a/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
+++ b/lldb/test/API/tools/lldb-dap/completions/TestDAP_completions.py
@@ -19,7 +19,6 @@ class TestDAP_completions(lldbdap_testcase.DAPTestCaseBase):
self.assertNotIn(not_expected_item, actual_list)
@skipIfWindows
- @skipIfRemote
@skipIf(compiler="clang", compiler_version=["<", "17.0"])
def test_completions(self):
"""
diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
index 8769f39633e6..e6345818bf08 100644
--- a/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
+++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_console.py
@@ -38,7 +38,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_scopes_variables_setVariable_evaluate(self):
"""
Tests that the "scopes" request causes the currently selected
@@ -82,7 +81,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
self.check_lldb_command("frame select", "frame #1", "frame 1 is selected")
@skipIfWindows
- @skipIfRemote
def test_custom_escape_prefix(self):
program = self.getBuildArtifact("a.out")
self.build_and_launch(program, commandEscapePrefix="::")
@@ -99,7 +97,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_empty_escape_prefix(self):
program = self.getBuildArtifact("a.out")
self.build_and_launch(program, commandEscapePrefix="")
@@ -116,7 +113,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_exit_status_message_sigterm(self):
source = "main.cpp"
program = self.getBuildArtifact("a.out")
@@ -154,7 +150,6 @@ class TestDAP_console(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_exit_status_message_ok(self):
source = "main.cpp"
program = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py b/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
index 85911a449efe..8b47d4b9d681 100644
--- a/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
+++ b/lldb/test/API/tools/lldb-dap/console/TestDAP_redirection_to_console.py
@@ -8,7 +8,6 @@ import lldbdap_testcase
class TestDAP_redirection_to_console(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test(self):
"""
Without proper stderr and stdout redirection, the following code would throw an
diff --git a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
index cabaeafc4a64..3c847dc269b2 100644
--- a/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
+++ b/lldb/test/API/tools/lldb-dap/coreFile/TestDAP_coreFile.py
@@ -13,7 +13,6 @@ import os
class TestDAP_coreFile(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
@skipIfLLVMTargetMissing("X86")
def test_core_file(self):
current_dir = os.path.dirname(__file__)
@@ -60,7 +59,6 @@ class TestDAP_coreFile(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(self.get_stackFrames(), expected_frames)
@skipIfWindows
- @skipIfRemote
@skipIfLLVMTargetMissing("X86")
def test_core_file_source_mapping(self):
"""Test that sourceMap property is correctly applied when loading a core"""
diff --git a/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py b/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py
index 1e0e40d4a013..a542a318050d 100644
--- a/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py
+++ b/lldb/test/API/tools/lldb-dap/databreakpoint/TestDAP_setDataBreakpoints.py
@@ -13,7 +13,6 @@ class TestDAP_setDataBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.accessTypes = ["read", "write", "readWrite"]
@skipIfWindows
- @skipIfRemote
def test_duplicate_start_addresses(self):
"""Test setDataBreakpoints with multiple watchpoints starting at the same addresses."""
program = self.getBuildArtifact("a.out")
@@ -58,7 +57,6 @@ class TestDAP_setDataBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(i_val, "2")
@skipIfWindows
- @skipIfRemote
def test_expression(self):
"""Tests setting data breakpoints on expression."""
program = self.getBuildArtifact("a.out")
@@ -99,7 +97,6 @@ class TestDAP_setDataBreakpoints(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(i_val, "2")
@skipIfWindows
- @skipIfRemote
def test_functionality(self):
"""Tests setting data breakpoints on variable."""
program = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py b/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py
index 1b96ea71659f..9e8ef5b289f2 100644
--- a/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py
+++ b/lldb/test/API/tools/lldb-dap/disassemble/TestDAP_disassemble.py
@@ -13,7 +13,6 @@ import os
class TestDAP_disassemble(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_disassemble(self):
"""
Tests the 'disassemble' request.
diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
index e5aab88c7fa4..f9e461adecb1 100644
--- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
+++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py
@@ -24,7 +24,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertTrue(output is None or len(output) == 0)
@skipIfWindows
- @skipIfRemote
def test_launch(self):
"""
This test launches a process that would create a file, but we disconnect
@@ -46,7 +45,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertFalse(os.path.exists(program + ".side_effect"))
@skipIfWindows
- @skipIfRemote
@expectedFailureNetBSD
def test_attach(self):
"""
diff --git a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
index 68c57ad77554..29548a835c69 100644
--- a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
+++ b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
@@ -192,31 +192,26 @@ class TestDAP_evaluate(lldbdap_testcase.DAPTestCaseBase):
self.assertEvaluate("my_bool_vec", "size=2")
@skipIfWindows
- @skipIfRemote
def test_generic_evaluate_expressions(self):
# Tests context-less expression evaluations
self.run_test_evaluate_expressions(enableAutoVariableSummaries=False)
@skipIfWindows
- @skipIfRemote
def test_repl_evaluate_expressions(self):
# Tests expression evaluations that are triggered from the Debug Console
self.run_test_evaluate_expressions("repl", enableAutoVariableSummaries=False)
@skipIfWindows
- @skipIfRemote
def test_watch_evaluate_expressions(self):
# Tests expression evaluations that are triggered from a watch expression
self.run_test_evaluate_expressions("watch", enableAutoVariableSummaries=True)
@skipIfWindows
- @skipIfRemote
def test_hover_evaluate_expressions(self):
# Tests expression evaluations that are triggered when hovering on the editor
self.run_test_evaluate_expressions("hover", enableAutoVariableSummaries=False)
@skipIfWindows
- @skipIfRemote
def test_variable_evaluate_expressions(self):
# Tests expression evaluations that are triggered in the variable explorer
self.run_test_evaluate_expressions("variable", enableAutoVariableSummaries=True)
diff --git a/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py b/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
index 58a67d816436..8c2c0154ba65 100644
--- a/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
+++ b/lldb/test/API/tools/lldb-dap/exception/TestDAP_exception.py
@@ -9,7 +9,6 @@ import lldbdap_testcase
class TestDAP_exception(lldbdap_testcase.DAPTestCaseBase):
- @skipIfRemote
@skipIfWindows
def test_stopped_description(self):
"""
diff --git a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
index 0760d358d9c0..05873e926b64 100644
--- a/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
+++ b/lldb/test/API/tools/lldb-dap/launch/TestDAP_launch.py
@@ -13,7 +13,6 @@ import os
class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_default(self):
"""
Tests the default launch of a simple program. No arguments,
@@ -29,7 +28,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertIn(program, lines[0], "make sure program path is in first argument")
@skipIfWindows
- @skipIfRemote
def test_termination(self):
"""
Tests the correct termination of lldb-dap upon a 'disconnect'
@@ -50,7 +48,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(self.dap_server.process.poll(), 0)
@skipIfWindows
- @skipIfRemote
def test_stopOnEntry(self):
"""
Tests the default launch of a simple program that stops at the
@@ -70,7 +67,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_cwd(self):
"""
Tests the default launch of a simple program with a current working
@@ -97,7 +93,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.assertTrue(found, "verified program working directory")
@skipIfWindows
- @skipIfRemote
def test_debuggerRoot(self):
"""
Tests the "debuggerRoot" will change the working directory of
@@ -127,7 +122,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.continue_to_exit()
@skipIfWindows
- @skipIfRemote
def test_sourcePath(self):
"""
Tests the "sourcePath" will set the target.source-map.
@@ -153,7 +147,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.continue_to_exit()
@skipIfWindows
- @skipIfRemote
def test_disableSTDIO(self):
"""
Tests the default launch of a simple program with STDIO disabled.
@@ -168,7 +161,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
@skipIfLinux # shell argument expansion doesn't seem to work on Linux
@expectedFailureAll(oslist=["freebsd", "netbsd"], bugnumber="llvm.org/pr48349")
- @skipIfRemote
def test_shellExpandArguments_enabled(self):
"""
Tests the default launch of a simple program with shell expansion
@@ -191,7 +183,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_shellExpandArguments_disabled(self):
"""
Tests the default launch of a simple program with shell expansion
@@ -214,7 +205,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_args(self):
"""
Tests launch of a simple program with arguments
@@ -240,7 +230,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_environment(self):
"""
Tests launch of a simple program with environment variables
@@ -270,7 +259,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
@skipIf(
archs=["arm", "aarch64"]
) # failed run https://lab.llvm.org/buildbot/#/builders/96/builds/6933
@@ -354,7 +342,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.verify_commands("terminateCommands", output, terminateCommands)
@skipIfWindows
- @skipIfRemote
def test_extra_launch_commands(self):
"""
Tests the "launchCommands" with extra launching settings
@@ -420,7 +407,6 @@ class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase):
self.verify_commands("exitCommands", output, exitCommands)
@skipIfWindows
- @skipIfRemote
def test_failing_launch_commands(self):
"""
Tests "launchCommands" failures prevents a launch.
diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
index 3f3ead033ce9..a4e0f04d450d 100644
--- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
+++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py
@@ -58,7 +58,6 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("addressRange", program_module)
@skipIfWindows
- @skipIfRemote
def test_modules(self):
"""
Mac or linux.
@@ -74,7 +73,6 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
)
@skipUnlessDarwin
- @skipIfRemote
def test_modules_dsym(self):
"""
Darwin only test with dSYM file.
@@ -85,7 +83,6 @@ class TestDAP_module(lldbdap_testcase.DAPTestCaseBase):
return self.run_test("a.out.dSYM", expect_debug_info_size=True)
@skipIfWindows
- @skipIfRemote
def test_compile_units(self):
program = self.getBuildArtifact("a.out")
self.build_and_launch(program)
diff --git a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
index 90b130d3af4d..dc7f4f98875f 100644
--- a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
+++ b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py
@@ -11,7 +11,6 @@ from lldbsuite.test.lldbtest import *
class TestDAP_optimized(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_stack_frame_name(self):
"""Test optimized frame has special name suffix."""
program = self.getBuildArtifact("a.out")
@@ -30,7 +29,6 @@ class TestDAP_optimized(lldbdap_testcase.DAPTestCaseBase):
self.assertTrue(parent_frame["name"].endswith(" [opt]"))
@skipIfWindows
- @skipIfRemote
def test_optimized_variable(self):
"""Test optimized variable value contains error."""
program = self.getBuildArtifact("a.out")
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
index 32dbc82a5729..36fa0bd40183 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart.py
@@ -9,7 +9,6 @@ import lldbdap_testcase
class TestDAP_restart(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_basic_functionality(self):
"""
Tests the basic restarting functionality: set two breakpoints in
@@ -45,7 +44,6 @@ class TestDAP_restart(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_stopOnEntry(self):
"""
Check that the stopOnEntry setting is still honored after a restart.
@@ -87,7 +85,6 @@ class TestDAP_restart(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_arguments(self):
"""
Tests that lldb-dap will use updated launch arguments included
diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
index c19a6d5b54cd..5a9938c25c2c 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
@@ -21,7 +21,6 @@ class TestDAP_restart_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
return False
@skipIfWindows
- @skipIfRemote
@skipIf(archs=["arm"]) # Always times out on buildbot
def test_basic_functionality(self):
"""
@@ -62,7 +61,6 @@ class TestDAP_restart_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=["arm"]) # Always times out on buildbot
def test_stopOnEntry(self):
"""
diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
index f79a31988dc6..9fcd210122d5 100644
--- a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py
@@ -44,7 +44,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
return False
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_runInTerminal(self):
if not self.isTestSupported():
@@ -92,7 +91,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("bar", env)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_runInTerminalInvalidTarget(self):
if not self.isTestSupported():
@@ -112,7 +110,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_missingArgInRunInTerminalLauncher(self):
if not self.isTestSupported():
@@ -128,7 +125,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_FakeAttachedRunInTerminalLauncherWithInvalidProgram(self):
if not self.isTestSupported():
@@ -156,7 +152,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("No such file or directory", stderr)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_FakeAttachedRunInTerminalLauncherWithValidProgram(self):
if not self.isTestSupported():
@@ -184,7 +179,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("foo", stdout)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_FakeAttachedRunInTerminalLauncherAndCheckEnvironment(self):
if not self.isTestSupported():
@@ -206,7 +200,6 @@ class TestDAP_runInTerminal(lldbdap_testcase.DAPTestCaseBase):
self.assertIn("FOO=BAR", stdout)
@skipIfWindows
- @skipIfRemote
@skipIf(archs=no_match(["x86_64"]))
def test_NonAttachedRunInTerminalLauncher(self):
if not self.isTestSupported():
diff --git a/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py b/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
index 70526cc71538..0d7776faa4a9 100644
--- a/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
+++ b/lldb/test/API/tools/lldb-dap/stackTrace/TestDAP_stackTrace.py
@@ -57,7 +57,6 @@ class TestDAP_stackTrace(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_stackTrace(self):
"""
Tests the 'stackTrace' packet and all its variants.
@@ -190,7 +189,6 @@ class TestDAP_stackTrace(lldbdap_testcase.DAPTestCaseBase):
)
@skipIfWindows
- @skipIfRemote
def test_functionNameWithArgs(self):
"""
Test that the stack frame without a function name is given its pc in the response.
diff --git a/lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py b/lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py
index 0011c0f616e1..a04c752764fb 100644
--- a/lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py
+++ b/lldb/test/API/tools/lldb-dap/stackTraceMissingFunctionName/TestDAP_stackTraceMissingFunctionName.py
@@ -13,7 +13,6 @@ from lldbsuite.test import lldbtest, lldbutil
class TestDAP_stackTraceMissingFunctionName(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_missingFunctionName(self):
"""
Test that the stack frame without a function name is given its pc in the response.
diff --git a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
index 7700c65f862d..fd48e69cae5e 100644
--- a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
+++ b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py
@@ -11,7 +11,6 @@ import lldbdap_testcase
class TestDAP_startDebugging(lldbdap_testcase.DAPTestCaseBase):
- @skipIfRemote
def test_startDebugging(self):
"""
Tests the "startDebugging" reverse request. It makes sure that the IDE can
diff --git a/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py b/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py
index 578e64e36ea0..8a1bb76340be 100644
--- a/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py
+++ b/lldb/test/API/tools/lldb-dap/step/TestDAP_step.py
@@ -12,7 +12,6 @@ import lldbdap_testcase
class TestDAP_step(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_step(self):
"""
Tests the stepping in/out/over in threads.
diff --git a/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py b/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py
index c538e8002a03..70c11a63a79f 100644
--- a/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py
+++ b/lldb/test/API/tools/lldb-dap/stop-hooks/TestDAP_stop_hooks.py
@@ -9,7 +9,6 @@ import lldbdap_testcase
class TestDAP_stop_hooks(lldbdap_testcase.DAPTestCaseBase):
- @skipIfRemote
def test_stop_hooks_before_run(self):
"""
Test that there is no race condition between lldb-dap and
diff --git a/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py b/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py
index ff5081a41424..6d1c25e8e453 100644
--- a/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py
+++ b/lldb/test/API/tools/lldb-dap/terminated-event/TestDAP_terminatedEvent.py
@@ -13,7 +13,6 @@ import json
class TestDAP_terminatedEvent(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_terminated_event(self):
"""
Terminated Event
diff --git a/lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py b/lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py
index f7f1ad7a3d50..6edb4b8e2a81 100644
--- a/lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py
+++ b/lldb/test/API/tools/lldb-dap/threads/TestDAP_threads.py
@@ -10,7 +10,6 @@ import lldbdap_testcase
class TestDAP_threads(lldbdap_testcase.DAPTestCaseBase):
@skipIfWindows
- @skipIfRemote
def test_correct_thread(self):
"""
Tests that the correct thread is selected if we continue from
@@ -45,7 +44,6 @@ class TestDAP_threads(lldbdap_testcase.DAPTestCaseBase):
self.assertTrue(stopped_event[0]["body"]["threadCausedFocus"])
@skipIfWindows
- @skipIfRemote
def test_thread_format(self):
"""
Tests the support for custom thread formats.
diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
index 57c17e5ea9d3..3c6901b2fd99 100644
--- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
+++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
@@ -394,14 +394,12 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
self.verify_variables(verify_locals, locals)
@skipIfWindows
- @skipIfRemote
def test_scopes_variables_setVariable_evaluate(self):
self.do_test_scopes_variables_setVariable_evaluate(
enableAutoVariableSummaries=False
)
@skipIfWindows
- @skipIfRemote
def test_scopes_variables_setVariable_evaluate_with_descriptive_summaries(self):
self.do_test_scopes_variables_setVariable_evaluate(
enableAutoVariableSummaries=True
@@ -603,12 +601,10 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
self.assertEqual(scope.get("presentationHint"), "registers")
@skipIfWindows
- @skipIfRemote
def test_scopes_and_evaluate_expansion(self):
self.do_test_scopes_and_evaluate_expansion(enableAutoVariableSummaries=False)
@skipIfWindows
- @skipIfRemote
def test_scopes_and_evaluate_expansion_with_descriptive_summaries(self):
self.do_test_scopes_and_evaluate_expansion(enableAutoVariableSummaries=True)
@@ -664,17 +660,14 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
self.verify_variables(verify_children, children)
@skipIfWindows
- @skipIfRemote
def test_indexedVariables(self):
self.do_test_indexedVariables(enableSyntheticChildDebugging=False)
@skipIfWindows
- @skipIfRemote
def test_indexedVariables_with_raw_child_for_synthetics(self):
self.do_test_indexedVariables(enableSyntheticChildDebugging=True)
@skipIfWindows
- @skipIfRemote
def test_registers(self):
"""
Test that registers whose byte size is the size of a pointer on
@@ -748,7 +741,6 @@ class TestDAP_variables(lldbdap_testcase.DAPTestCaseBase):
@no_debug_info_test
@skipIfWindows
- @skipIfRemote
def test_value_format(self):
"""
Test that toggling variables' value format between decimal and hexadecimal works.
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s b/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s
new file mode 100644
index 000000000000..3f32c037aeb2
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/invalid_abbreviation.s
@@ -0,0 +1,47 @@
+# REQUIRES: x86
+
+# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t
+# RUN: %lldb %t \
+# RUN: -o exit 2>&1 | FileCheck %s
+
+# CHECK-DAG: error: {{.*}} [0x0000000000000022]: abbreviation code 65536 too big, please file a bug and attach the file at the start of this error message
+# CHECK-DAG: error: {{.*}} [0x0000000000000048]: invalid abbreviation code 47, please file a bug and attach the file at the start of this error message
+
+
+ .section .debug_abbrev,"",@progbits
+ .uleb128 65535 # Largest representable Abbreviation Code
+ .byte 17 # DW_TAG_compile_unit
+ .byte 1 # DW_CHILDREN_yes
+ .byte 37 # DW_AT_producer
+ .byte 8 # DW_FORM_string
+ .byte 0 # EOM(1)
+ .byte 0 # EOM(2)
+ .byte 0 # EOM(3)
+
+ .section .debug_info,"",@progbits
+.Lcu_begin0:
+ .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .uleb128 65535 # DW_TAG_compile_unit
+ .asciz "Hand-written DWARF" # DW_AT_producer
+ .uleb128 65536 # Unrepresentable abbreviation
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end0:
+
+ .section .debug_info,"",@progbits
+.Lcu_begin1:
+ .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+ .short 5 # DWARF version number
+ .byte 1 # DWARF Unit Type
+ .byte 8 # Address Size (in bytes)
+ .long .debug_abbrev # Offset Into Abbrev. Section
+ .uleb128 65535 # DW_TAG_compile_unit
+ .asciz "Hand-written DWARF" # DW_AT_producer
+ .byte 47 # Missing abbreviation
+ .byte 0 # End Of Children Mark
+.Ldebug_info_end1:
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index bbd9d46ba3a0..a88ee3e1dec6 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -26,6 +26,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/JSON.h"
+#include "llvm/Support/Threading.h"
#include "llvm/Support/raw_ostream.h"
#include "lldb/API/SBAttachInfo.h"
@@ -169,6 +170,7 @@ struct DAP {
std::optional<llvm::json::Object> last_launch_or_attach_request;
lldb::tid_t focus_tid;
bool disconnecting = false;
+ llvm::once_flag terminated_event_flag;
bool stop_at_entry;
bool is_attach;
bool enable_auto_variable_summaries;
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 170fa88f1e8b..7746afb6cbbf 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -227,13 +227,12 @@ void SendContinuedEvent() {
// debugged.
void SendTerminatedEvent() {
// Prevent races if the process exits while we're being asked to disconnect.
- static std::mutex mutex;
- std::lock_guard<std::mutex> locker(mutex);
-
- g_dap.RunTerminateCommands();
- // Send a "terminated" event
- llvm::json::Object event(CreateTerminatedEventObject());
- g_dap.SendJSON(llvm::json::Value(std::move(event)));
+ llvm::call_once(g_dap.terminated_event_flag, [&] {
+ g_dap.RunTerminateCommands();
+ // Send a "terminated" event
+ llvm::json::Object event(CreateTerminatedEventObject());
+ g_dap.SendJSON(llvm::json::Value(std::move(event)));
+ });
}
// Send a thread stopped event for all threads as long as the process
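The hunk above replaces a function-local static std::mutex with llvm::call_once on a per-DAP once_flag, so the terminate commands and the terminated event run at most once per session, where the old mutex only serialized repeated calls. A minimal sketch of the same primitive, with illustrative names rather than the actual lldb-dap types:

#include "llvm/Support/Threading.h"
#include <cstdio>

// Illustrative names only; this mirrors the once_flag/call_once shape used
// for the terminated event, not the real DAP struct.
struct Session {
  llvm::once_flag terminated_once;
};

static void sendTerminated(Session &session) {
  llvm::call_once(session.terminated_once, [] {
    std::puts("terminated event sent"); // body runs at most once per Session
  });
}

int main() {
  Session session;
  sendTerminated(session); // prints
  sendTerminated(session); // no-op: the flag is already set
  return 0;
}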
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index f3aeafac6fab..612e90abd409 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -539,8 +539,6 @@ set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should sear
set(LLVM_TARGET_ARCH "host"
CACHE STRING "Set target to use for LLVM JIT or use \"host\" for automatic detection.")
-option(LLVM_ENABLE_TERMINFO "Use terminfo database if available." ON)
-
set(LLVM_ENABLE_LIBXML2 "ON" CACHE STRING "Use libxml2 if available. Can be ON, OFF, or FORCE_ON")
option(LLVM_ENABLE_LIBEDIT "Use libedit if available." ON)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index bf1b110245bb..8cfb36b0194e 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -240,21 +240,11 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*")
else()
set(HAVE_LIBEDIT 0)
endif()
- if(LLVM_ENABLE_TERMINFO)
- if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON)
- find_package(Terminfo REQUIRED)
- else()
- find_package(Terminfo)
- endif()
- set(LLVM_ENABLE_TERMINFO "${Terminfo_FOUND}")
- endif()
else()
set(HAVE_LIBEDIT 0)
- set(LLVM_ENABLE_TERMINFO 0)
endif()
else()
set(HAVE_LIBEDIT 0)
- set(LLVM_ENABLE_TERMINFO 0)
endif()
# function checks
@@ -415,15 +405,18 @@ if( LLVM_ENABLE_PIC )
set(ENABLE_PIC 1)
else()
set(ENABLE_PIC 0)
- check_cxx_compiler_flag("-fno-pie" SUPPORTS_NO_PIE_FLAG)
- if(SUPPORTS_NO_PIE_FLAG)
- set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fno-pie")
- endif()
+ set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fno-pie")
endif()
-check_cxx_compiler_flag("-Wvariadic-macros" SUPPORTS_VARIADIC_MACROS_FLAG)
-check_cxx_compiler_flag("-Wgnu-zero-variadic-macro-arguments"
- SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG)
+set(SUPPORTS_VARIADIC_MACROS_FLAG 0)
+if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+ set(SUPPORTS_VARIADIC_MACROS_FLAG 1)
+endif()
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ set(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG 1)
+else()
+ set(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG 0)
+endif()
set(USE_NO_MAYBE_UNINITIALIZED 0)
set(USE_NO_UNINITIALIZED 0)
@@ -433,11 +426,9 @@ set(USE_NO_UNINITIALIZED 0)
if (CMAKE_COMPILER_IS_GNUCXX)
# Disable all -Wuninitialized warning for old GCC versions.
if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0)
- check_cxx_compiler_flag("-Wuninitialized" HAS_UNINITIALIZED)
- set(USE_NO_UNINITIALIZED ${HAS_UNINITIALIZED})
+ set(USE_NO_UNINITIALIZED 1)
else()
- check_cxx_compiler_flag("-Wmaybe-uninitialized" HAS_MAYBE_UNINITIALIZED)
- set(USE_NO_MAYBE_UNINITIALIZED ${HAS_MAYBE_UNINITIALIZED})
+ set(USE_NO_MAYBE_UNINITIALIZED 1)
endif()
endif()
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index ecbae8a32f1a..03f4e1f190fd 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -276,11 +276,11 @@ if (NOT DEFINED LLVM_LINKER_DETECTED AND NOT WIN32)
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
include(CheckLinkerFlag)
- # Linkers that support Darwin allow a setting to internalize all symbol exports,
+ # Linkers that support Darwin allow a setting to internalize all symbol exports,
# aiding in reducing binary size and often is applicable for executables.
check_linker_flag(C "-Wl,-no_exported_symbols" LLVM_LINKER_SUPPORTS_NO_EXPORTED_SYMBOLS)
-
- if (NOT LLVM_USE_LINKER)
+
+ if (NOT LLVM_USE_LINKER)
# Apple's linker complains about duplicate libraries, which CMake likes to do
# to support ELF platforms. To silence that warning, we can use
# -no_warn_duplicate_libraries, but only in versions of the linker that
@@ -289,8 +289,8 @@ if (NOT DEFINED LLVM_LINKER_DETECTED AND NOT WIN32)
else()
set(LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES OFF CACHE INTERNAL "")
endif()
-
- else()
+
+ else()
set(LLVM_LINKER_SUPPORTS_NO_EXPORTED_SYMBOLS OFF CACHE INTERNAL "")
endif()
endif()
@@ -1069,7 +1069,7 @@ macro(add_llvm_executable name)
add_llvm_symbol_exports( ${name} ${LLVM_EXPORTED_SYMBOL_FILE} )
endif(LLVM_EXPORTED_SYMBOL_FILE)
- if (DEFINED LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES AND
+ if (DEFINED LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES AND
NOT LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES)
if(LLVM_LINKER_SUPPORTS_NO_EXPORTED_SYMBOLS)
set_property(TARGET ${name} APPEND_STRING PROPERTY
@@ -1677,7 +1677,7 @@ function(add_unittest test_suite test_name)
if (SUPPORTS_VARIADIC_MACROS_FLAG)
list(APPEND LLVM_COMPILE_FLAGS "-Wno-variadic-macros")
- endif ()
+ endif()
# Some parts of gtest rely on this GNU extension, don't warn on it.
if(SUPPORTS_GNU_ZERO_VARIADIC_MACRO_ARGUMENTS_FLAG)
list(APPEND LLVM_COMPILE_FLAGS "-Wno-gnu-zero-variadic-macro-arguments")
diff --git a/llvm/cmake/modules/FindTerminfo.cmake b/llvm/cmake/modules/FindTerminfo.cmake
deleted file mode 100644
index 163af6697067..000000000000
--- a/llvm/cmake/modules/FindTerminfo.cmake
+++ /dev/null
@@ -1,55 +0,0 @@
-# Attempts to discover terminfo library with a linkable setupterm function.
-#
-# Example usage:
-#
-# find_package(Terminfo)
-#
-# If successful, the following variables will be defined:
-# Terminfo_FOUND
-# Terminfo_LIBRARIES
-#
-# Additionally, the following import target will be defined:
-# Terminfo::terminfo
-
-find_library(Terminfo_LIBRARIES NAMES terminfo tinfo curses ncurses ncursesw)
-
-if(Terminfo_LIBRARIES)
- include(CMakePushCheckState)
- cmake_push_check_state()
- list(APPEND CMAKE_REQUIRED_LIBRARIES ${Terminfo_LIBRARIES})
- set(Terminfo_LINKABLE_SRC [=[
- #ifdef __cplusplus
- extern "C" {
- #endif
- int setupterm(char *term, int filedes, int *errret);
- #ifdef __cplusplus
- }
- #endif
- int main(void) { return setupterm(0, 0, 0); }
- ]=])
- if(DEFINED CMAKE_C_COMPILER)
- include(CheckCSourceCompiles)
- check_c_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE)
- else()
- include(CheckCXXSourceCompiles)
- check_cxx_source_compiles("${Terminfo_LINKABLE_SRC}" Terminfo_LINKABLE)
- endif()
- cmake_pop_check_state()
-endif()
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(Terminfo
- FOUND_VAR
- Terminfo_FOUND
- REQUIRED_VARS
- Terminfo_LIBRARIES
- Terminfo_LINKABLE)
-mark_as_advanced(Terminfo_LIBRARIES
- Terminfo_LINKABLE)
-
-if(Terminfo_FOUND)
- if(NOT TARGET Terminfo::terminfo)
- add_library(Terminfo::terminfo UNKNOWN IMPORTED)
- set_target_properties(Terminfo::terminfo PROPERTIES IMPORTED_LOCATION "${Terminfo_LIBRARIES}")
- endif()
-endif()
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index d16641d83190..99d848ba3d85 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -425,7 +425,7 @@ if( LLVM_ENABLE_PIC )
# GCC for MIPS can miscompile LLVM due to PR37701.
if(CMAKE_COMPILER_IS_GNUCXX AND LLVM_NATIVE_ARCH STREQUAL "Mips" AND
NOT Uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
- add_flag_or_print_warning("-fno-shrink-wrap" FNO_SHRINK_WRAP)
+ append("-fno-shrink-wrap" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
# gcc with -O3 -fPIC generates TLS sequences that violate the spec on
# Solaris/sparcv9, causing executables created with the system linker
@@ -635,18 +635,16 @@ if( MSVC )
# This checks CMAKE_CXX_COMPILER_ID in addition to check_cxx_compiler_flag()
# because cl.exe does not emit an error on flags it doesn't understand,
# letting check_cxx_compiler_flag() claim it understands all flags.
- check_cxx_compiler_flag("/Brepro" SUPPORTS_BREPRO)
- if (SUPPORTS_BREPRO)
- # Check if /INCREMENTAL is passed to the linker and complain that it
- # won't work with /Brepro.
- has_msvc_incremental_no_flag("${CMAKE_EXE_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_EXE_LINKER_FLAGS}" NO_INCR_EXE)
- has_msvc_incremental_no_flag("${CMAKE_MODULE_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_MODULE_LINKER_FLAGS}" NO_INCR_MODULE)
- has_msvc_incremental_no_flag("${CMAKE_SHARED_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_SHARED_LINKER_FLAGS}" NO_INCR_SHARED)
- if (NO_INCR_EXE AND NO_INCR_MODULE AND NO_INCR_SHARED)
- append("/Brepro" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- else()
- message(WARNING "/Brepro not compatible with /INCREMENTAL linking - builds will be non-deterministic")
- endif()
+
+ # Check if /INCREMENTAL is passed to the linker and complain that it
+ # won't work with /Brepro.
+ has_msvc_incremental_no_flag("${CMAKE_EXE_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_EXE_LINKER_FLAGS}" NO_INCR_EXE)
+ has_msvc_incremental_no_flag("${CMAKE_MODULE_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_MODULE_LINKER_FLAGS}" NO_INCR_MODULE)
+ has_msvc_incremental_no_flag("${CMAKE_SHARED_LINKER_FLAGS_${uppercase_CMAKE_BUILD_TYPE}} ${CMAKE_SHARED_LINKER_FLAGS}" NO_INCR_SHARED)
+ if (NO_INCR_EXE AND NO_INCR_MODULE AND NO_INCR_SHARED)
+ append("/Brepro" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ else()
+ message(WARNING "/Brepro not compatible with /INCREMENTAL linking - builds will be non-deterministic")
endif()
endif()
# By default MSVC has a 2^16 limit on the number of sections in an object file,
@@ -667,19 +665,22 @@ endif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
# Specific default warnings-as-errors for compilers accepting GCC-compatible warning flags:
if ( LLVM_COMPILER_IS_GCC_COMPATIBLE OR CMAKE_CXX_COMPILER_ID MATCHES "XL" )
- add_flag_if_supported("-Werror=date-time" WERROR_DATE_TIME)
- add_flag_if_supported("-Werror=unguarded-availability-new" WERROR_UNGUARDED_AVAILABILITY_NEW)
+ append("-Werror=date-time" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif( LLVM_COMPILER_IS_GCC_COMPATIBLE OR CMAKE_CXX_COMPILER_ID MATCHES "XL" )
-if ( LLVM_COMPILER_IS_GCC_COMPATIBLE )
+if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-Werror=unguarded-availability-new" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+endif()
+
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GCC")
# LLVM data structures like llvm::User and llvm::MDNode rely on
# the value of object storage persisting beyond the lifetime of the
# object (#24952). This is not standard compliant and causes a runtime
# crash if LLVM is built with GCC and LTO enabled (#57740). Until
# these bugs are fixed, we need to disable dead store eliminations
# based on object lifetime.
- add_flag_if_supported("-fno-lifetime-dse" CMAKE_CXX_FLAGS)
-endif ( LLVM_COMPILER_IS_GCC_COMPATIBLE )
+ append("-fno-lifetime-dse" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+endif ()
# Modules enablement for GCC-compatible compilers:
if ( LLVM_COMPILER_IS_GCC_COMPATIBLE AND LLVM_ENABLE_MODULES )
@@ -697,22 +698,7 @@ if ( LLVM_COMPILER_IS_GCC_COMPATIBLE AND LLVM_ENABLE_MODULES )
(uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO")))
set(module_flags "${module_flags} -gmodules")
endif()
- set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${module_flags}")
-
- # Check that we can build code with modules enabled, and that repeatedly
- # including <cassert> still manages to respect NDEBUG properly.
- CHECK_CXX_SOURCE_COMPILES("#undef NDEBUG
- #include <cassert>
- #define NDEBUG
- #include <cassert>
- int main() { assert(this code is not compiled); }"
- CXX_SUPPORTS_MODULES)
- set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
- if (CXX_SUPPORTS_MODULES)
- append("${module_flags}" CMAKE_CXX_FLAGS)
- else()
- message(FATAL_ERROR "LLVM_ENABLE_MODULES is not supported by this compiler")
- endif()
+ append("${module_flags}" CMAKE_CXX_FLAGS)
endif( LLVM_COMPILER_IS_GCC_COMPATIBLE AND LLVM_ENABLE_MODULES )
if (MSVC)
@@ -814,13 +800,10 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
# Turn off missing field initializer warnings for gcc to avoid noise from
# false positives with empty {}. Turn them on otherwise (they're off by
# default for clang).
- check_cxx_compiler_flag("-Wmissing-field-initializers" CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
- if (CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
- if (CMAKE_COMPILER_IS_GNUCXX)
- append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- else()
- append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- endif()
+ if (CMAKE_COMPILER_IS_GNUCXX)
+ append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ else()
+ append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
if (LLVM_ENABLE_PEDANTIC AND LLVM_COMPILER_IS_GCC_COMPATIBLE)
@@ -833,8 +816,13 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
add_flag_if_supported("-Wc++98-compat-extra-semi" CXX98_COMPAT_EXTRA_SEMI_FLAG)
endif()
- add_flag_if_supported("-Wimplicit-fallthrough" IMPLICIT_FALLTHROUGH_FLAG)
- add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
+ append("-Wimplicit-fallthrough" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+
+ set(CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG 0)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ set(CXX_SUPPORTS_COVERED_SWITCH_DEFAULT_FLAG 1)
+ append("-Wcovered-switch-default" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ endif()
append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
@@ -845,38 +833,32 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
# Disable -Wclass-memaccess, a C++-only warning from GCC 8 that fires on
# LLVM's ADT classes.
- check_cxx_compiler_flag("-Wclass-memaccess" CXX_SUPPORTS_CLASS_MEMACCESS_FLAG)
- append_if(CXX_SUPPORTS_CLASS_MEMACCESS_FLAG "-Wno-class-memaccess" CMAKE_CXX_FLAGS)
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1)
+ append("-Wno-class-memaccess" CMAKE_CXX_FLAGS)
+ endif()
+ endif()
# Disable -Wredundant-move and -Wpessimizing-move on GCC>=9. GCC wants to
- # remove std::move in code like "A foo(ConvertibleToA a) {
- # return std::move(a); }", but this code does not compile (or uses the copy
+ # remove std::move in code like
+ # "A foo(ConvertibleToA a) { return std::move(a); }",
+ # but this code does not compile (or uses the copy
# constructor instead) on clang<=3.8. Clang also has a -Wredundant-move and
# -Wpessimizing-move, but they only fire when the types match exactly, so we
# can keep them here.
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
- check_cxx_compiler_flag("-Wredundant-move" CXX_SUPPORTS_REDUNDANT_MOVE_FLAG)
- append_if(CXX_SUPPORTS_REDUNDANT_MOVE_FLAG "-Wno-redundant-move" CMAKE_CXX_FLAGS)
- check_cxx_compiler_flag("-Wpessimizing-move" CXX_SUPPORTS_PESSIMIZING_MOVE_FLAG)
- append_if(CXX_SUPPORTS_PESSIMIZING_MOVE_FLAG "-Wno-pessimizing-move" CMAKE_CXX_FLAGS)
+ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 9.1)
+ append("-Wno-redundant-move" CMAKE_CXX_FLAGS)
+ append("-Wno-pessimizing-move" CMAKE_CXX_FLAGS)
+ endif()
endif()
# The LLVM libraries have no stable C++ API, so -Wnoexcept-type is not useful.
- check_cxx_compiler_flag("-Wnoexcept-type" CXX_SUPPORTS_NOEXCEPT_TYPE_FLAG)
- append_if(CXX_SUPPORTS_NOEXCEPT_TYPE_FLAG "-Wno-noexcept-type" CMAKE_CXX_FLAGS)
-
- # Check if -Wnon-virtual-dtor warns for a class marked final, when it has a
- # friend declaration. If it does, don't add -Wnon-virtual-dtor. The case is
- # considered unhelpful (https://gcc.gnu.org/PR102168).
- set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
- set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror=non-virtual-dtor")
- CHECK_CXX_SOURCE_COMPILES("class f {};
- class base {friend f; public: virtual void anchor();protected: ~base();};
- int main() { return 0; }"
- CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR)
- set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
- append_if(CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
+ append("-Wno-noexcept-type" CMAKE_CXX_FLAGS)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
+ endif()
append("-Wdelete-non-virtual-dtor" CMAKE_CXX_FLAGS)
# Enable -Wsuggest-override if it's available, and only if it doesn't
@@ -906,14 +888,15 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
endif()
# Enable -Wstring-conversion to catch misuse of string literals.
- add_flag_if_supported("-Wstring-conversion" STRING_CONVERSION_FLAG)
+ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-Wstring-conversion" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# Disable the misleading indentation warning with GCC; GCC can
# produce noisy notes about this getting disabled in large files.
# See e.g. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89549
- check_cxx_compiler_flag("-Wmisleading-indentation" CXX_SUPPORTS_MISLEADING_INDENTATION_FLAG)
- append_if(CXX_SUPPORTS_MISLEADING_INDENTATION_FLAG "-Wno-misleading-indentation" CMAKE_CXX_FLAGS)
+ append("-Wno-misleading-indentation" CMAKE_CXX_FLAGS)
else()
# Prevent bugs that can happen with llvm's brace style.
add_flag_if_supported("-Wmisleading-indentation" MISLEADING_INDENTATION_FLAG)
@@ -931,14 +914,15 @@ macro(append_common_sanitizer_flags)
if (NOT MSVC OR CLANG_CL)
# Append -fno-omit-frame-pointer and turn on debug info to get better
# stack traces.
- add_flag_if_supported("-fno-omit-frame-pointer" FNO_OMIT_FRAME_POINTER)
+ append("-fno-omit-frame-pointer" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
if (NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND
- NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO")
- add_flag_if_supported("-gline-tables-only" GLINE_TABLES_ONLY)
+ NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "RELWITHDEBINFO" AND
+ CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+ append("-gline-tables-only" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
# Use -O1 even in debug mode, otherwise sanitizers slowdown is too large.
if (uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" AND LLVM_OPTIMIZE_SANITIZED_BUILDS)
- add_flag_if_supported("-O1" O1)
+ append("-O1" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
else()
# Always ask the linker to produce symbols with asan.
@@ -1112,15 +1096,12 @@ endif()
if(NOT CYGWIN AND NOT MSVC)
if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin" AND
NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG")
- check_c_compiler_flag("-Werror -fno-function-sections" C_SUPPORTS_FNO_FUNCTION_SECTIONS)
- if (C_SUPPORTS_FNO_FUNCTION_SECTIONS)
- # Don't add -ffunction-sections if it can't be disabled with -fno-function-sections.
- # Doing so will break sanitizers.
- add_flag_if_supported("-ffunction-sections" FFUNCTION_SECTIONS)
- elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL")
+ if (CMAKE_CXX_COMPILER_ID MATCHES "XL")
append("-qfuncsect" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ else()
+ append("-ffunction-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
- add_flag_if_supported("-fdata-sections" FDATA_SECTIONS)
+ append("-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
elseif(MSVC)
if( NOT uppercase_CMAKE_BUILD_TYPE STREQUAL "DEBUG" )
@@ -1385,7 +1366,9 @@ if(LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO)
file(RELATIVE_PATH relative_root "${CMAKE_BINARY_DIR}" "${source_root}")
append_if(SUPPORTS_FDEBUG_PREFIX_MAP "-fdebug-prefix-map=${CMAKE_BINARY_DIR}=${relative_root}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(SUPPORTS_FDEBUG_PREFIX_MAP "-fdebug-prefix-map=${source_root}/=${LLVM_SOURCE_PREFIX}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- add_flag_if_supported("-no-canonical-prefixes" NO_CANONICAL_PREFIXES)
+ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+ append("-no-canonical-prefixes" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ endif()
endif()
option(LLVM_USE_RELATIVE_PATHS_IN_FILES "Use relative paths in sources and debug info" OFF)
@@ -1400,7 +1383,9 @@ if(LLVM_USE_RELATIVE_PATHS_IN_FILES)
file(RELATIVE_PATH relative_root "${CMAKE_BINARY_DIR}" "${source_root}")
append_if(SUPPORTS_FFILE_PREFIX_MAP "-ffile-prefix-map=${CMAKE_BINARY_DIR}=${relative_root}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(SUPPORTS_FFILE_PREFIX_MAP "-ffile-prefix-map=${source_root}/=${LLVM_SOURCE_PREFIX}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
- add_flag_if_supported("-no-canonical-prefixes" NO_CANONICAL_PREFIXES)
+ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
+ append("-no-canonical-prefixes" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+ endif()
endif()
set(LLVM_THIRD_PARTY_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../third-party CACHE STRING
diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in
index 397bd5815b64..7e1501a89354 100644
--- a/llvm/cmake/modules/LLVMConfig.cmake.in
+++ b/llvm/cmake/modules/LLVMConfig.cmake.in
@@ -60,11 +60,6 @@ if(LLVM_ENABLE_LIBEDIT)
find_package(LibEdit)
endif()
-set(LLVM_ENABLE_TERMINFO @LLVM_ENABLE_TERMINFO@)
-if(LLVM_ENABLE_TERMINFO)
- find_package(Terminfo)
-endif()
-
set(LLVM_ENABLE_THREADS @LLVM_ENABLE_THREADS@)
set(LLVM_ENABLE_UNWIND_TABLES @LLVM_ENABLE_UNWIND_TABLES@)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 75536bc5bea6..1004956ac8f1 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1970,6 +1970,8 @@ The AMDGPU backend uses the following ELF header:
``EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC`` 0x053 ``gfx10-3-generic``
``EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC`` 0x054 ``gfx11-generic``
*reserved* 0x055 Reserved.
+ *reserved* 0x056 Reserved.
+ *reserved* 0x057 Reserved.
========================================== ========== =============================
Sections
diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 3588ef14db15..646f1d09dfab 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -349,6 +349,11 @@ The :doc:`CodeOfConduct` applies to all office hours.
- Every two weeks, Wednesdays at 2:00pm US Pacific, for 90 minutes.
- Livestream chat or `Google meet <https://meet.google.com/wit-tvzc-dwc>`__
- English
+ * - Renato Golin
+ - General LLVM, MLIR & Linalg, distributed computing, research, socials.
+ - Every first Tuesday of the month, 11:00am UK time, for 60 minutes.
+ - `Google meet <https://meet.google.com/esg-fggc-hfe>`__
+ - English, Portuguese
* - Rotating hosts
- Getting Started, beginner questions, new contributors.
- Every Tuesday at 2 PM ET (11 AM PT), for 30 minutes.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 358eb4b86792..d2d21c7c4b5e 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15761,8 +15761,8 @@ The arguments and return value are floating-point numbers of the same type.
Semantics:
""""""""""
-Return the same value as a corresponding libm '``fma``' function but without
-trapping or setting ``errno``.
+Return the same value as the IEEE-754 fusedMultiplyAdd operation. This
+is assumed to not trap or set ``errno``.
When specified with the fast-math-flag 'afn', the result may be approximated
using a less accurate calculation.
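
To make the fusedMultiplyAdd wording above concrete: the product is not rounded before the addition, so the fused form can keep low-order bits that a separate multiply-and-add loses. A minimal sketch in plain C++ (using std::fma rather than the llvm.fma intrinsic; the operands are chosen only to expose the single rounding):

#include <cmath>
#include <cstdio>

int main() {
  // The exact product (1+u)(1-u) = 1 - u*u cannot be represented in double,
  // so the separately rounded result differs from the fused one.
  double a = 1.0 + 0x1p-52;  // 1 + ulp(1)
  double b = 1.0 - 0x1p-52;  // 1 - ulp(1)
  double c = -1.0;

  double separate = a * b + c;          // product rounded to 1.0, then added
  double fused    = std::fma(a, b, c);  // single rounding, like llvm.fma
  std::printf("separate = %a\nfused    = %a\n", separate, fused);
  // Under round-to-nearest this prints 0x0p+0 and -0x1p-104.
  return 0;
}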
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index cba36c7177da..a495e6cb1706 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -62,6 +62,10 @@ Changes to LLVM infrastructure
Changes to building LLVM
------------------------
+- The ``LLVM_ENABLE_TERMINFO`` flag has been removed. LLVM no longer depends on
+ terminfo and now always uses the ``TERM`` environment variable for color
+ support autodetection.
+
Changes to TableGen
-------------------
@@ -130,6 +134,7 @@ Changes to the RISC-V Backend
match GNU objdump. The bytes within the groups are in big endian order.
* Added smstateen extension to -march. CSR names for smstateen were already supported.
* Zaamo and Zalrsc are no longer experimental.
+* Processors that enable post-register-allocation scheduling (PostMachineScheduler) by default should use the `UsePostRAScheduler` subtarget feature. Setting `PostRAScheduler = 1` in the scheduler model no longer enables the PostMachineScheduler.
Changes to the WebAssembly Backend
----------------------------------
@@ -140,6 +145,9 @@ Changes to the Windows Target
Changes to the X86 Backend
--------------------------
+- Removed the KNL/KNM-specific ISA intrinsics AVX512PF, AVX512ER, and PREFETCHWT1,
+ while assembly encoding/decoding support is kept.
+
Changes to the OCaml bindings
-----------------------------
@@ -239,6 +247,11 @@ Changes to the LLVM tools
documented in `--help` output and the command guide. (`#90474
<https://github.com/llvm/llvm-project/pull/90474>`)
+* llvm-readobj's LLVM output format for ELF core files has been changed.
+ Similarly, the JSON format has been fixed for this case. The NT_FILE note
+ now has a map for the mapped files. (`#92835
+ <https://github.com/llvm/llvm-project/pull/92835>`).
+
Changes to LLDB
---------------------------------
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index d27177a4541a..657b0fb9b672 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -143,6 +143,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
- Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values.
* - ``SPV_INTEL_function_pointers``
- Allows translation of function pointers.
+ * - ``SPV_INTEL_inline_assembly``
+ - Allows the use of inline assembly.
* - ``SPV_INTEL_optnone``
- Adds OptNoneINTEL value for Function Control mask that indicates a request to not optimize the function.
* - ``SPV_INTEL_subgroups``
@@ -161,6 +163,8 @@ list of supported SPIR-V extensions, sorted alphabetically by their extension na
- Allows to use the LinkOnceODR linkage type that lets a function or global variable to be merged with other functions or global variables of the same name when linkage occurs.
* - ``SPV_KHR_no_integer_wrap_decoration``
- Adds decorations to indicate that a given instruction does not cause integer wrapping.
+ * - ``SPV_KHR_shader_clock``
+ - Adds support for the cl_khr_kernel_clock extension, which gives a kernel the ability to sample the value of clocks provided by compute units.
* - ``SPV_KHR_subgroup_rotate``
- Adds a new instruction that enables rotating values across invocations within a subgroup.
* - ``SPV_KHR_uniform_group_instructions``
@@ -333,6 +337,10 @@ SPIR-V backend, along with their descriptions and argument details.
- 32-bit Integer
- `[]`
- Generates an undefined value. Useful for optimizations and indicating uninitialized variables.
+ * - `int_spv_inline_asm`
+ - None
+ - `[Metadata, Metadata, Vararg]`
+ - Associates inline assembly features with inline assembly call instances by creating metadata and preserving the original arguments. Not emitted directly, but used to support the SPIR-V representation in LLVM IR.
* - `int_spv_assume`
- None
- `[1-bit Integer]`
diff --git a/llvm/include/llvm/Analysis/CFG.h b/llvm/include/llvm/Analysis/CFG.h
index 86b01c13274f..23bc10a4a9d1 100644
--- a/llvm/include/llvm/Analysis/CFG.h
+++ b/llvm/include/llvm/Analysis/CFG.h
@@ -96,6 +96,18 @@ bool isPotentiallyReachableFromMany(
const SmallPtrSetImpl<BasicBlock *> *ExclusionSet,
const DominatorTree *DT = nullptr, const LoopInfo *LI = nullptr);
+/// Determine whether there is potentially a path from at least one block in
+/// 'Worklist' to at least one block in 'StopSet' within a single function
+/// without passing through any of the blocks in 'ExclusionSet'. Returns false
+/// only if we can prove that once any block in 'Worklist' has been reached then
+/// no blocks in 'StopSet' can be executed without passing through any blocks in
+/// 'ExclusionSet'. Conservatively returns true.
+bool isManyPotentiallyReachableFromMany(
+ SmallVectorImpl<BasicBlock *> &Worklist,
+ const SmallPtrSetImpl<const BasicBlock *> &StopSet,
+ const SmallPtrSetImpl<BasicBlock *> *ExclusionSet,
+ const DominatorTree *DT = nullptr, const LoopInfo *LI = nullptr);
+
/// Return true if the control flow in \p RPOTraversal is irreducible.
///
/// This is a generic implementation to detect CFG irreducibility based on loop
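
A rough usage sketch for the declaration above; the surrounding helper, its name, and the way the block sets are obtained are invented for illustration:

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/CFG.h"
using namespace llvm;

// Sketch: can any block that follows a call still reach one of the cleanup
// blocks without going through an excluded block? A 'false' answer is a
// proof; 'true' may be conservative.
static bool cleanupStillReachable(
    SmallVectorImpl<BasicBlock *> &AfterCallBlocks,
    const SmallPtrSetImpl<const BasicBlock *> &CleanupBlocks,
    const SmallPtrSetImpl<BasicBlock *> *ExcludedBlocks,
    const DominatorTree *DT, const LoopInfo *LI) {
  return isManyPotentiallyReachableFromMany(AfterCallBlocks, CleanupBlocks,
                                            ExcludedBlocks, DT, LI);
}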
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0c3a6b3742c7..cefce93f9e25 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1397,7 +1397,7 @@ public:
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
int VF,
const APInt &DemandedDstElts,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind) const;
/// \return The cost of Load and Store instructions.
InstructionCost
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 10f1333cf888..e12eb7095b90 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -12,11 +12,6 @@
// This .def file also allows creating an array of vector functions supported in
// the specified framework or library.
-#if defined(TLI_DEFINE_MASSV_VECFUNCS_NAMES)
-#define TLI_DEFINE_MASSV_VECFUNCS
-#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) VEC,
-#endif
-
#define FIXED(NL) ElementCount::getFixed(NL)
#define SCALABLE(NL) ElementCount::getScalable(NL)
#define NOMASK false
@@ -1276,14 +1271,3 @@ TLI_DEFINE_VECFUNC("cbrtf", "amd_vrs4_cbrtf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
#undef FIXED
#undef TLI_DEFINE_VECFUNC
-#undef TLI_DEFINE_ACCELERATE_VECFUNCS
-#undef TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
-#undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS
-#undef TLI_DEFINE_MASSV_VECFUNCS
-#undef TLI_DEFINE_SVML_VECFUNCS
-#undef TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
-#undef TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
-#undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
-#undef TLI_DEFINE_MASSV_VECFUNCS_NAMES
-#undef TLI_DEFINE_ARMPL_VECFUNCS
-#undef TLI_DEFINE_AMDLIBM_VECFUNCS
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
index f296acc2ca4b..67cacaed2e12 100644
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -796,11 +796,13 @@ enum : unsigned {
EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053,
EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055,
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056,
+ EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057,
// clang-format on
// First/last AMDGCN-based processors.
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
- EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC,
+ EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57,
// Indicates if the "xnack" target feature is enabled for all code contained
// in the object.
diff --git a/llvm/include/llvm/Bitcode/BitcodeWriter.h b/llvm/include/llvm/Bitcode/BitcodeWriter.h
index 248d33f4502e..a343f0e05763 100644
--- a/llvm/include/llvm/Bitcode/BitcodeWriter.h
+++ b/llvm/include/llvm/Bitcode/BitcodeWriter.h
@@ -102,7 +102,8 @@ class raw_ostream;
void writeIndex(
const ModuleSummaryIndex *Index,
- const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex);
+ const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+ const GVSummaryPtrSet *DecSummaries);
};
/// Write the specified module to the specified raw output stream.
@@ -147,10 +148,12 @@ class raw_ostream;
/// where it will be written in a new bitcode block. This is used when
/// writing the combined index file for ThinLTO. When writing a subset of the
/// index for a distributed backend, provide the \p ModuleToSummariesForIndex
- /// map.
+ /// map. \p DecSummaries specifies the set of summaries for which the
+ /// corresponding value should be imported as a declaration (prototype).
void writeIndexToFile(const ModuleSummaryIndex &Index, raw_ostream &Out,
const std::map<std::string, GVSummaryMapTy>
- *ModuleToSummariesForIndex = nullptr);
+ *ModuleToSummariesForIndex = nullptr,
+ const GVSummaryPtrSet *DecSummaries = nullptr);
/// If EmbedBitcode is set, save a copy of the llvm IR as data in the
/// __LLVM,__bitcode section (.llvmbc on non-MacOS).
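
A minimal sketch of how the extra parameter might be threaded through when writing a distributed-backend index slice; the helper name and the assumption that the caller already ran the import-planning step are illustrative only:

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/raw_ostream.h"
#include <map>
#include <string>
using namespace llvm;

// Sketch: write the index slice for one distributed backend. 'Index',
// 'ModuleToSummaries' and 'DeclSummaries' are assumed to come from the
// ThinLTO import-planning step (gatherImportedSummariesForModule).
static void writeBackendIndexSlice(
    const ModuleSummaryIndex &Index,
    const std::map<std::string, GVSummaryMapTy> &ModuleToSummaries,
    const GVSummaryPtrSet &DeclSummaries, raw_ostream &OS) {
  writeIndexToFile(Index, OS, &ModuleToSummaries, &DeclSummaries);
}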
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index a9a33c7617d7..2111e82e1a99 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -869,6 +869,9 @@ public:
/// Combine insert vector element OOB.
bool matchInsertVectorElementOOB(MachineInstr &MI, BuildFnTy &MatchInfo);
+ bool matchFreezeOfSingleMaybePoisonOperand(MachineInstr &MI,
+ BuildFnTy &MatchInfo);
+
private:
/// Checks for legality of an indexed variant of \p LdSt.
bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const;
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 2a3145b635e6..2b3efc3b609f 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -34,6 +34,17 @@ public:
static bool classof(const MachineInstr *MI) {
return isPreISelGenericOpcode(MI->getOpcode());
}
+
+ bool hasPoisonGeneratingFlags() const {
+ return getFlags() & (NoUWrap | NoSWrap | IsExact | Disjoint | NonNeg |
+ FmNoNans | FmNoInfs);
+ }
+
+ void dropPoisonGeneratingFlags() {
+ clearFlags(NoUWrap | NoSWrap | IsExact | Disjoint | NonNeg | FmNoNans |
+ FmNoInfs);
+ assert(!hasPoisonGeneratingFlags());
+ }
};
/// Provides common memory operand functionality.
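
A hedged sketch of how the two new helpers are meant to be used; the helper function and the situation in the comment are illustrative, not the actual combiner code:

#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
using namespace llvm;

// Sketch: before rewriting an instruction so that a freeze is pushed through
// it, any poison-generating flags (nuw/nsw/exact/disjoint/nneg/nnan/ninf)
// must be dropped, since they would reintroduce the poison the freeze was
// meant to stop.
static void dropFlagsForFreezePush(MachineInstr &MI) {
  if (auto *GMI = dyn_cast<GenericMachineInstr>(&MI))
    if (GMI->hasPoisonGeneratingFlags())
      GMI->dropPoisonGeneratingFlags();
}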
diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h
index 2b0c5d166d88..db48a0ae5514 100644
--- a/llvm/include/llvm/CodeGen/MachineInstr.h
+++ b/llvm/include/llvm/CodeGen/MachineInstr.h
@@ -416,6 +416,12 @@ public:
Flags &= ~((uint32_t)Flag);
}
+ void clearFlags(unsigned flags) {
+ assert(isUInt<LLVM_MI_FLAGS_BITS>(flags) &&
+ "flags to be cleared are out of range for the Flags field");
+ Flags &= ~flags;
+ }
+
/// Return true if MI is in a bundle (but not the first MI in a bundle).
///
/// A bundle looks like this before it's finalized:
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h
index b66c66d1bfc4..dab6c421bf6e 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.h
+++ b/llvm/include/llvm/CodeGen/ValueTypes.h
@@ -488,8 +488,10 @@ namespace llvm {
Type *getTypeForEVT(LLVMContext &Context) const;
/// Return the value type corresponding to the specified type.
- /// This returns all pointers as iPTR. If HandleUnknown is true, unknown
- /// types are returned as Other, otherwise they are invalid.
+ /// If HandleUnknown is true, unknown types are returned as Other,
+ /// otherwise they are invalid.
+ /// NB: This includes pointer types, which require a DataLayout to convert
+ /// to a concrete value type.
static EVT getEVT(Type *Ty, bool HandleUnknown = false);
intptr_t getRawBits() const {
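
A minimal sketch of what the note above means in practice, assuming the usual DataLayout and EVT helpers (getPointerSizeInBits, getIntegerVT); the helper name is invented:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Sketch: resolve a possibly-pointer IR type to a concrete EVT. EVT::getEVT
// alone cannot pick a width for pointers; that comes from the DataLayout.
static EVT getConcreteEVT(Type *Ty, const DataLayout &DL) {
  if (auto *PTy = dyn_cast<PointerType>(Ty))
    return EVT::getIntegerVT(Ty->getContext(),
                             DL.getPointerSizeInBits(PTy->getAddressSpace()));
  return EVT::getEVT(Ty, /*HandleUnknown=*/true);
}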
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index 900b30d9b024..c3e378ed8f6e 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -296,18 +296,23 @@ def MetadataVT : ValueType<0, 249> { // Metadata
def iPTRAny : VTAny<250>;
// Pseudo valuetype to represent "vector of any size"
+// Should only be used in TableGen.
def vAny : VTAny<251>;
// Pseudo valuetype to represent "float of any format"
+// Should only be used in TableGen.
def fAny : VTAny<252>;
// Pseudo valuetype to represent "integer of any bit width"
+// Should only be used in TableGen.
def iAny : VTAny<253>;
// Pseudo valuetype mapped to the current pointer size.
+// Should only be used in TableGen.
def iPTR : ValueType<0, 254>;
// Pseudo valuetype to represent "any type of any size".
+// Should only be used in TableGen.
def Any : VTAny<255>;
} // end defset ValueTypes
diff --git a/llvm/include/llvm/CodeGenTypes/MachineValueType.h b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
index 9aceb9896021..3b2a9b535c09 100644
--- a/llvm/include/llvm/CodeGenTypes/MachineValueType.h
+++ b/llvm/include/llvm/CodeGenTypes/MachineValueType.h
@@ -476,9 +476,11 @@ namespace llvm {
return getVectorVT(VT, EC.getKnownMinValue());
}
- /// Return the value type corresponding to the specified type. This returns
- /// all pointers as iPTR. If HandleUnknown is true, unknown types are
- /// returned as Other, otherwise they are invalid.
+ /// Return the value type corresponding to the specified type.
+ /// If HandleUnknown is true, unknown types are returned as Other,
+ /// otherwise they are invalid.
+ /// NB: This includes pointer types, which require a DataLayout to convert
+ /// to a concrete value type.
static MVT getVT(Type *Ty, bool HandleUnknown = false);
public:
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 977c182e9d2b..ff30741c8f36 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -209,9 +209,6 @@
/* Define to 1 if you have the <sys/types.h> header file. */
#cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H}
-/* Define if the setupterm() function is supported this platform. */
-#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO}
-
/* Define to 1 if you have the <termios.h> header file. */
#cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H}
diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
index 13a37265762a..5a3f8c605959 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h
@@ -19,7 +19,7 @@
// - EmptyTrait: the class has no data members.
// - WrapperTrait: the class has a single member `v`
// - TupleTrait: the class has a tuple member `t`
-// - UnionTrait the class has a varuant member `u`
+// - UnionTrait the class has a variant member `u`
// - IncompleteTrait: the class is a placeholder class that is currently empty,
// but will be completed at a later time.
// Note: This structure follows the one used in flang parser.
diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h
index e718e6e7e340..a5e2f809ab41 100644
--- a/llvm/include/llvm/IR/ConstantRange.h
+++ b/llvm/include/llvm/IR/ConstantRange.h
@@ -419,6 +419,15 @@ public:
/// treating both this and \p Other as unsigned ranges.
ConstantRange multiply(const ConstantRange &Other) const;
+ /// Return a new range representing the possible values resulting
+ /// from a multiplication with wrap type \p NoWrapKind of a value in this
+ /// range and a value in \p Other.
+ /// If the result range is disjoint, the preferred range is determined by the
+ /// \p PreferredRangeType.
+ ConstantRange
+ multiplyWithNoWrap(const ConstantRange &Other, unsigned NoWrapKind,
+ PreferredRangeType RangeType = Smallest) const;
+
/// Return range of possible values for a signed multiplication of this and
/// \p Other. However, if overflow is possible always return a full range
/// rather than trying to determine a more precise result.
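
A small sketch of the new API, assuming the NoWrapKind encoding follows the existing addWithNoWrap convention of OverflowingBinaryOperator flags; the concrete ranges are arbitrary:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Operator.h"
using namespace llvm;

// Sketch: i8 ranges [2,5) = {2,3,4} and [3,4) = {3}, multiplied under 'nsw'.
static ConstantRange nswProduct() {
  ConstantRange LHS(APInt(8, 2), APInt(8, 5));
  ConstantRange RHS(APInt(8, 3), APInt(8, 4));
  // The result is a conservative superset of {6, 9, 12}; excluding signed
  // wrap lets the range stay tight for operands near the type boundary,
  // where plain multiply() would have to widen towards a full set.
  return LHS.multiplyWithNoWrap(RHS, OverflowingBinaryOperator::NoSignedWrap);
}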
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 0d8746344a44..40a9cf507248 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -455,7 +455,7 @@ public:
/// block.
GlobalVariable *CreateGlobalString(StringRef Str, const Twine &Name = "",
unsigned AddressSpace = 0,
- Module *M = nullptr);
+ Module *M = nullptr, bool AddNull = true);
/// Get a constant value representing either true or false.
ConstantInt *getInt1(bool V) {
@@ -1992,8 +1992,9 @@ public:
/// block.
Constant *CreateGlobalStringPtr(StringRef Str, const Twine &Name = "",
unsigned AddressSpace = 0,
- Module *M = nullptr) {
- GlobalVariable *GV = CreateGlobalString(Str, Name, AddressSpace, M);
+ Module *M = nullptr, bool AddNull = true) {
+ GlobalVariable *GV =
+ CreateGlobalString(Str, Name, AddressSpace, M, AddNull);
Constant *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0);
Constant *Indices[] = {Zero, Zero};
return ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(), GV,
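
A short sketch of the new AddNull parameter; the helper and its use case (length-prefixed data that should not get an implicit terminator) are illustrative assumptions:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Sketch: emit string data without the implicit trailing '\0', e.g. for a
// blob paired with an explicit length instead of being NUL-terminated.
static Constant *emitRawBytes(IRBuilder<> &B, Module &M, StringRef Bytes) {
  return B.CreateGlobalStringPtr(Bytes, "raw.bytes", /*AddressSpace=*/0, &M,
                                 /*AddNull=*/false);
}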
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index be8048ca2459..d4a8954a4cda 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2466,25 +2466,27 @@ def int_amdgcn_perm :
// GFX9 Intrinsics
//===----------------------------------------------------------------------===//
-class AMDGPUGlobalLoadLDS : Intrinsic <
- [],
- [LLVMQualPointerType<1>, // Base global pointer to load from
- LLVMQualPointerType<3>, // LDS base pointer to store to
- llvm_i32_ty, // Data byte size: 1/2/4
- llvm_i32_ty, // imm offset (applied to both global and LDS address)
- llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
- // bit 1 = slc/sc1,
- // bit 2 = dlc on gfx10/gfx11))
- // bit 4 = scc/nt on gfx90a+))
- // gfx12+:
- // cachepolicy (bits [0-2] = th,
- // bits [3-4] = scope)
- // swizzled buffer (bit 6 = swz),
- [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
- ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
- "", [SDNPMemOperand]>;
+class AMDGPUGlobalLoadLDS :
+ ClangBuiltin<"__builtin_amdgcn_global_load_lds">,
+ Intrinsic <
+ [],
+ [LLVMQualPointerType<1>, // Base global pointer to load from
+ LLVMQualPointerType<3>, // LDS base pointer to store to
+ llvm_i32_ty, // Data byte size: 1/2/4
+ llvm_i32_ty, // imm offset (applied to both global and LDS address)
+ llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = sc0,
+ // bit 1 = sc1,
+ // bit 4 = scc))
+ [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+ ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
+// Use read/write of inaccessible memory to model the fact that this reads a
+// volatile value.
+def int_amdgcn_pops_exiting_wave_id :
+ DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrInaccessibleMemOnly]>;
+
//===----------------------------------------------------------------------===//
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index cc84decc4340..90f12674d047 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -36,6 +36,7 @@ let TargetPrefix = "spv" in {
def int_spv_alloca : Intrinsic<[llvm_any_ty], []>;
def int_spv_alloca_array : Intrinsic<[llvm_any_ty], [llvm_anyint_ty]>;
def int_spv_undef : Intrinsic<[llvm_i32_ty], []>;
+ def int_spv_inline_asm : Intrinsic<[], [llvm_metadata_ty, llvm_metadata_ty, llvm_vararg_ty]>;
// Expect, Assume Intrinsics
def int_spv_assume : Intrinsic<[], [llvm_i1_ty]>;
diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
index 572d334ac955..237f268784bb 100644
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@@ -337,6 +337,14 @@ def int_wasm_storef16_f32:
[llvm_float_ty, llvm_ptr_ty],
[IntrWriteMem, IntrArgMemOnly],
"", [SDNPMemOperand]>;
+def int_wasm_splat_f16x8:
+ DefaultAttrsIntrinsic<[llvm_v8f16_ty],
+ [llvm_float_ty],
+ [IntrNoMem, IntrSpeculatable]>;
+def int_wasm_extract_lane_f16x8:
+ DefaultAttrsIntrinsic<[llvm_float_ty],
+ [llvm_v8f16_ty, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index fdc2b0fb7f80..aee804047e1b 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -3843,58 +3843,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
DefaultAttrsIntrinsic<[llvm_v16f32_ty],
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty],
[IntrNoMem]>;
-
- def int_x86_avx512_rcp28_ps : ClangBuiltin<"__builtin_ia32_rcp28ps_mask">,
- DefaultAttrsIntrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_rcp28_pd : ClangBuiltin<"__builtin_ia32_rcp28pd_mask">,
- DefaultAttrsIntrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_exp2_ps : ClangBuiltin<"__builtin_ia32_exp2ps_mask">,
- DefaultAttrsIntrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_exp2_pd : ClangBuiltin<"__builtin_ia32_exp2pd_mask">,
- DefaultAttrsIntrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
-
- def int_x86_avx512_rcp28_ss : ClangBuiltin<"__builtin_ia32_rcp28ss_round_mask">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i8_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_rcp28_sd : ClangBuiltin<"__builtin_ia32_rcp28sd_round_mask">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i8_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_rsqrt28_ps : ClangBuiltin<"__builtin_ia32_rsqrt28ps_mask">,
- DefaultAttrsIntrinsic<[llvm_v16f32_ty],
- [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_rsqrt28_pd : ClangBuiltin<"__builtin_ia32_rsqrt28pd_mask">,
- DefaultAttrsIntrinsic<[llvm_v8f64_ty],
- [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
- llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<3>>]>;
- def int_x86_avx512_rsqrt28_ss : ClangBuiltin<"__builtin_ia32_rsqrt28ss_round_mask">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i8_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_rsqrt28_sd : ClangBuiltin<"__builtin_ia32_rsqrt28sd_round_mask">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty],
- [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i8_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<4>>]>;
def int_x86_avx512_psad_bw_512 : ClangBuiltin<"__builtin_ia32_psadbw512">,
DefaultAttrsIntrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
[IntrNoMem, Commutative]>;
@@ -4177,38 +4125,6 @@ let TargetPrefix = "x86" in {
Intrinsic<[],
[llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty],
[ImmArg<ArgIndex<4>>]>;
-
- // gather prefetch
- // NOTE: These can't be ArgMemOnly because you can put the address completely
- // in the index register.
- def int_x86_avx512_gatherpf_dpd_512 : ClangBuiltin<"__builtin_ia32_gatherpfdpd">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_gatherpf_dps_512 : ClangBuiltin<"__builtin_ia32_gatherpfdps">,
- Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_gatherpf_qpd_512 : ClangBuiltin<"__builtin_ia32_gatherpfqpd">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_gatherpf_qps_512 : ClangBuiltin<"__builtin_ia32_gatherpfqps">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
-
- // scatter prefetch
- // NOTE: These can't be ArgMemOnly because you can put the address completely
- // in the index register.
- def int_x86_avx512_scatterpf_dpd_512 : ClangBuiltin<"__builtin_ia32_scatterpfdpd">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_scatterpf_dps_512 : ClangBuiltin<"__builtin_ia32_scatterpfdps">,
- Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_scatterpf_qpd_512 : ClangBuiltin<"__builtin_ia32_scatterpfqpd">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
- def int_x86_avx512_scatterpf_qps_512 : ClangBuiltin<"__builtin_ia32_scatterpfqps">,
- Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty, llvm_i32_ty], [ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;
}
// AVX512 gather/scatter intrinsics that use vXi1 masks.
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 20f5bb2b531d..8eced073501e 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -174,10 +174,10 @@ HELPER_REGISTER_BINARY_INT_VP(vp_add, VP_ADD, Add, ADD)
HELPER_REGISTER_BINARY_INT_VP(vp_and, VP_AND, And, AND)
// llvm.vp.ashr(x,y,mask,vlen)
-HELPER_REGISTER_BINARY_INT_VP(vp_ashr, VP_ASHR, AShr, SRA)
+HELPER_REGISTER_BINARY_INT_VP(vp_ashr, VP_SRA, AShr, SRA)
// llvm.vp.lshr(x,y,mask,vlen)
-HELPER_REGISTER_BINARY_INT_VP(vp_lshr, VP_LSHR, LShr, SRL)
+HELPER_REGISTER_BINARY_INT_VP(vp_lshr, VP_SRL, LShr, SRL)
// llvm.vp.mul(x,y,mask,vlen)
HELPER_REGISTER_BINARY_INT_VP(vp_mul, VP_MUL, Mul, MUL)
diff --git a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
index c450acda82ad..f1337e82485c 100644
--- a/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
+++ b/llvm/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
@@ -271,12 +271,13 @@ public:
const lto::InputFile &File);
/**
- * Compute the list of summaries needed for importing into module.
+ * Compute the list of summaries and the subset of declaration summaries
+ * needed for importing into the module.
*/
void gatherImportedSummariesForModule(
Module &Module, ModuleSummaryIndex &Index,
std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
- const lto::InputFile &File);
+ GVSummaryPtrSet &DecSummaries, const lto::InputFile &File);
/**
* Perform internalization. Index is updated to reflect linkage changes.
diff --git a/llvm/include/llvm/MCA/InstrBuilder.h b/llvm/include/llvm/MCA/InstrBuilder.h
index 359437248914..00c7942e4fa1 100644
--- a/llvm/include/llvm/MCA/InstrBuilder.h
+++ b/llvm/include/llvm/MCA/InstrBuilder.h
@@ -78,6 +78,7 @@ class InstrBuilder {
bool FirstCallInst;
bool FirstReturnInst;
+ unsigned CallLatency;
using InstRecycleCallback = std::function<Instruction *(const InstrDesc &)>;
InstRecycleCallback InstRecycleCB;
@@ -98,7 +99,7 @@ class InstrBuilder {
public:
InstrBuilder(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
const MCRegisterInfo &RI, const MCInstrAnalysis *IA,
- const InstrumentManager &IM);
+ const InstrumentManager &IM, unsigned CallLatency);
void clear() {
Descriptors.clear();
diff --git a/llvm/include/llvm/Object/ObjectFile.h b/llvm/include/llvm/Object/ObjectFile.h
index 8c868c7643ed..f49763e31a9c 100644
--- a/llvm/include/llvm/Object/ObjectFile.h
+++ b/llvm/include/llvm/Object/ObjectFile.h
@@ -302,6 +302,7 @@ protected:
public:
ObjectFile() = delete;
ObjectFile(const ObjectFile &other) = delete;
+ ObjectFile &operator=(const ObjectFile &other) = delete;
uint64_t getCommonSymbolSize(DataRefImpl Symb) const {
Expected<uint32_t> SymbolFlagsOrErr = getSymbolFlags(Symb);
diff --git a/llvm/include/llvm/Option/ArgList.h b/llvm/include/llvm/Option/ArgList.h
index fcde68e0b7fe..09812f976d01 100644
--- a/llvm/include/llvm/Option/ArgList.h
+++ b/llvm/include/llvm/Option/ArgList.h
@@ -319,11 +319,15 @@ public:
}
/// Render only the last argument match \p Id0, if present.
- template<typename ...OptSpecifiers>
- void AddLastArg(ArgStringList &Output, OptSpecifiers ...Ids) const {
+ template <typename... OptSpecifiers>
+ void addLastArg(ArgStringList &Output, OptSpecifiers... Ids) const {
if (Arg *A = getLastArg(Ids...)) // Calls claim() on all Ids's Args.
A->render(*this, Output);
}
+ template <typename... OptSpecifiers>
+ void AddLastArg(ArgStringList &Output, OptSpecifiers... Ids) const {
+ addLastArg(Output, Ids...);
+ }
/// AddAllArgsExcept - Render all arguments matching any of the given ids
/// and not matching any of the excluded ids.
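
A small sketch of the renamed entry point; the helper and the OptSpecifier it forwards are placeholders, and the old AddLastArg spelling keeps compiling through the forwarding wrapper above:

#include "llvm/Option/ArgList.h"
#include "llvm/Option/OptSpecifier.h"
using namespace llvm::opt;

// Sketch: forward the last occurrence of a flag to a tool's command line.
static void forwardLastFlag(const ArgList &Args, ArgStringList &CmdArgs,
                            OptSpecifier Flag) {
  Args.addLastArg(CmdArgs, Flag);
}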
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
index 88c7fe425b5a..2cee928b210e 100644
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -385,8 +385,9 @@ struct TemporalProfTraceTy {
/// Use a set of temporal profile traces to create a list of balanced
/// partitioning function nodes used by BalancedPartitioning to generate a
/// function order that reduces page faults during startup
- static std::vector<BPFunctionNode>
- createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces);
+ static void createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces,
+ std::vector<BPFunctionNode> &Nodes,
+ bool RemoveOutlierUNs = true);
};
inline std::error_code make_error_code(instrprof_error E) {
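
A brief sketch of the new out-parameter form of createBPFunctionNodes; the wrapper function is invented, and the default for RemoveOutlierUNs is spelled out only for clarity:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ProfileData/InstrProf.h"
#include <vector>
using namespace llvm;

// Sketch: nodes now come back through an out-parameter; the extra flag
// controls whether outlier utility nodes are removed before partitioning.
static void buildStartupOrderNodes(ArrayRef<TemporalProfTraceTy> Traces,
                                   std::vector<BPFunctionNode> &Nodes) {
  TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes,
                                             /*RemoveOutlierUNs=*/true);
}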
@@ -1184,35 +1185,32 @@ inline uint64_t ComputeHash(StringRef K) { return ComputeHash(HashType, K); }
// data file in indexed-format. Please update llvm/docs/InstrProfileFormat.rst
// as appropriate when updating the indexed profile format.
struct Header {
- uint64_t Magic;
+ uint64_t Magic = IndexedInstrProf::Magic;
// The lower 32 bits specify the version of the indexed profile.
// The most significant 32 bits are reserved to specify the variant types of
// the profile.
- uint64_t Version;
- uint64_t Unused; // Becomes unused since version 4
- uint64_t HashType;
+ uint64_t Version = 0;
+ uint64_t Unused = 0; // Becomes unused since version 4
+ uint64_t HashType = static_cast<uint64_t>(IndexedInstrProf::HashType);
// This field records the offset of this hash table's metadata (i.e., the
// number of buckets and entries), which follows right after the payload of
// the entire hash table.
- uint64_t HashOffset;
- uint64_t MemProfOffset;
- uint64_t BinaryIdOffset;
- uint64_t TemporalProfTracesOffset;
- uint64_t VTableNamesOffset;
+ uint64_t HashOffset = 0;
+ uint64_t MemProfOffset = 0;
+ uint64_t BinaryIdOffset = 0;
+ uint64_t TemporalProfTracesOffset = 0;
+ uint64_t VTableNamesOffset = 0;
// New fields should only be added at the end to ensure that the size
// computation is correct. The methods below need to be updated to ensure that
// the new field is read correctly.
- // Reads a header struct from the buffer.
+ // Reads a header struct from the buffer. Header fields are in machine native
+ // endianness.
static Expected<Header> readFromBuffer(const unsigned char *Buffer);
// Returns the size of the header in bytes for all valid fields based on the
// version. I.e a older version header will return a smaller size.
size_t size() const;
-
- // Returns the format version in little endian. The header retains the version
- // in native endian of the compiler runtime.
- uint64_t formatVersion() const;
};
// Profile summary data recorded in the profile data file in indexed
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 9b35768205f9..46aa1b6c2bfe 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -649,6 +649,8 @@ public:
class IndexedMemProfReader {
private:
+ /// The MemProf version.
+ memprof::IndexedVersion Version = memprof::Version0;
/// MemProf profile schema (if available).
memprof::MemProfSchema Schema;
/// MemProf record profile data on-disk indexed via llvm::md5(FunctionName).
diff --git a/llvm/include/llvm/ProfileData/InstrProfWriter.h b/llvm/include/llvm/ProfileData/InstrProfWriter.h
index 97f6a95ab715..b8b6c684717b 100644
--- a/llvm/include/llvm/ProfileData/InstrProfWriter.h
+++ b/llvm/include/llvm/ProfileData/InstrProfWriter.h
@@ -218,6 +218,9 @@ private:
// back patching.
uint64_t writeHeader(const IndexedInstrProf::Header &header,
const bool WritePrevVersion, ProfOStream &OS);
+
+ // Writes compressed vtable names to profiles.
+ Error writeVTableNames(ProfOStream &OS);
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Support/Error.h b/llvm/include/llvm/Support/Error.h
index 217130ce293a..662c3ea46e3c 100644
--- a/llvm/include/llvm/Support/Error.h
+++ b/llvm/include/llvm/Support/Error.h
@@ -1236,10 +1236,10 @@ class StringError : public ErrorInfo<StringError> {
public:
static char ID;
- // Prints EC + S and converts to EC
+ StringError(std::string &&S, std::error_code EC, bool PrintMsgOnly);
+ /// Prints EC + S and converts to EC.
StringError(std::error_code EC, const Twine &S = Twine());
-
- // Prints S and converts to EC
+ /// Prints S and converts to EC.
StringError(const Twine &S, std::error_code EC);
void log(raw_ostream &OS) const override;
@@ -1258,15 +1258,23 @@ template <typename... Ts>
inline Error createStringError(std::error_code EC, char const *Fmt,
const Ts &... Vals) {
std::string Buffer;
- raw_string_ostream Stream(Buffer);
- Stream << format(Fmt, Vals...);
- return make_error<StringError>(Stream.str(), EC);
+ raw_string_ostream(Buffer) << format(Fmt, Vals...);
+ return make_error<StringError>(Buffer, EC);
}
-Error createStringError(std::error_code EC, char const *Msg);
+Error createStringError(std::string &&Msg, std::error_code EC);
+
+inline Error createStringError(std::error_code EC, const char *S) {
+ return createStringError(std::string(S), EC);
+}
inline Error createStringError(std::error_code EC, const Twine &S) {
- return createStringError(EC, S.str().c_str());
+ return createStringError(S.str(), EC);
+}
+
+/// Create a StringError with an inconvertible error code.
+inline Error createStringError(const Twine &S) {
+ return createStringError(llvm::inconvertibleErrorCode(), S);
}
template <typename... Ts>
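
A short sketch of the overload set after this change; the function, its parameters, and the message text are invented:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
using namespace llvm;

// Sketch of the two common spellings after this change.
static Error reportBadInput(StringRef Path, int Line) {
  if (Line < 0)
    // printf-style message with an explicit error code, as before.
    return createStringError(inconvertibleErrorCode(),
                             "bad line number %d in %s", Line,
                             Path.str().c_str());
  // New convenience overload: message only, inconvertible code implied.
  return createStringError("unsupported input file '" + Path + "'");
}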
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 5d4b5a2479f6..8012f9192277 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -220,6 +220,13 @@ def idempotent_prop : GICombineRule<
(match (idempotent_prop_frags $dst, $src)),
(apply (GIReplaceReg $dst, $src))>;
+// Convert freeze(Op(Op0, NonPoisonOps...)) to Op(freeze(Op0), NonPoisonOps...)
+// when Op0 is not guaranteed non-poison
+def push_freeze_to_prevent_poison_from_propagating : GICombineRule<
+ (defs root:$root, build_fn_matchinfo:$matchinfo),
+ (match (G_FREEZE $dst, $src):$root,
+ [{ return !isGuaranteedNotToBePoison(${src}.getReg(), MRI) && Helper.matchFreezeOfSingleMaybePoisonOperand(*${root}, ${matchinfo}); }]),
+ (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>;
def extending_loads : GICombineRule<
(defs root:$root, extending_load_matchdata:$matchinfo),
@@ -1634,6 +1641,78 @@ extract_vector_element_shuffle_vector,
insert_vector_element_extract_vector_element
]>;
+
+// fold ((0-A) + B) -> B-A
+def ZeroMinusAPlusB : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub, 0, $A),
+ (G_ADD $root, $sub, $B)),
+ (apply (G_SUB $root, $B, $A))>;
+
+// fold (A + (0-B)) -> A-B
+def APlusZeroMinusB : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub, 0, $B),
+ (G_ADD $root, $A, $sub)),
+ (apply (G_SUB $root, $A, $B))>;
+
+ // fold (A+(B-A)) -> B
+ def APlusBMinusB : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub, $B, $A),
+ (G_ADD $root, $A, $sub)),
+ (apply (GIReplaceReg $root, $B))>;
+
+// fold ((B-A)+A) -> B
+ def BMinusAPlusA : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub, $B, $A),
+ (G_ADD $root, $sub, $A)),
+ (apply (GIReplaceReg $root, $B))>;
+
+// fold ((A-B)+(C-A)) -> (C-B)
+def AMinusBPlusCMinusA : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub1, $A, $B),
+ (G_SUB $sub2, $C, $A),
+ (G_ADD $root, $sub1, $sub2)),
+ (apply (G_SUB $root, $C, $B))>;
+
+// fold ((A-B)+(B-C)) -> (A-C)
+def AMinusBPlusBMinusC : GICombineRule<
+ (defs root:$root),
+ (match (G_SUB $sub1, $A, $B),
+ (G_SUB $sub2, $B, $C),
+ (G_ADD $root, $sub1, $sub2)),
+ (apply (G_SUB $root, $A, $C))>;
+
+// fold (A+(B-(A+C))) to (B-C)
+def APlusBMinusAplusC : GICombineRule<
+ (defs root:$root),
+ (match (G_ADD $add1, $A, $C),
+ (G_SUB $sub1, $B, $add1),
+ (G_ADD $root, $A, $sub1)),
+ (apply (G_SUB $root, $B, $C))>;
+
+// fold (A+(B-(C+A))) to (B-C)
+def APlusBMinusCPlusA : GICombineRule<
+ (defs root:$root),
+ (match (G_ADD $add1, $C, $A),
+ (G_SUB $sub1, $B, $add1),
+ (G_ADD $root, $A, $sub1)),
+ (apply (G_SUB $root, $B, $C))>;
+
+def integer_reassoc_combines: GICombineGroup<[
+ ZeroMinusAPlusB,
+ APlusZeroMinusB,
+ APlusBMinusB,
+ BMinusAPlusA,
+ AMinusBPlusCMinusA,
+ AMinusBPlusBMinusC,
+ APlusBMinusAplusC,
+ APlusBMinusCPlusA
+]>;
+
// FIXME: These should use the custom predicate feature once it lands.
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero,
undef_to_negative_one,
@@ -1691,7 +1770,8 @@ def fma_combines : GICombineGroup<[combine_fadd_fmul_to_fmad_or_fma,
def constant_fold_binops : GICombineGroup<[constant_fold_binop,
constant_fold_fp_binop]>;
-def all_combines : GICombineGroup<[trivial_combines, vector_ops_combines,
+def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines,
+ vector_ops_combines,
insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload,
combine_extracted_vector_load,
undef_combines, identity_combines, phi_combines,
@@ -1713,7 +1793,8 @@ def all_combines : GICombineGroup<[trivial_combines, vector_ops_combines,
sub_add_reg, select_to_minmax, redundant_binop_in_equality,
fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors,
combine_concat_vector, double_icmp_zero_and_or_combine, match_addos,
- sext_trunc, zext_trunc, combine_shuffle_concat]>;
+ sext_trunc, zext_trunc, combine_shuffle_concat,
+ push_freeze_to_prevent_poison_from_propagating]>;
// A combine group used for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
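
The reassociation rules above encode ordinary integer identities that hold in wrap-around (modulo 2^N) arithmetic, which is the flag-free G_ADD/G_SUB case. A standalone C++ check of the same identities (not GlobalISel code; the constants are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  // Unsigned 32-bit arithmetic wraps modulo 2^32, mirroring G_ADD/G_SUB
  // without no-wrap flags.
  uint32_t A = 0xdeadbeefu, B = 0x12345678u, C = 0x0badf00du;

  assert((0u - A) + B == B - A);          // ZeroMinusAPlusB
  assert(A + (0u - B) == A - B);          // APlusZeroMinusB
  assert(A + (B - A) == B);               // APlusBMinusB
  assert((B - A) + A == B);               // BMinusAPlusA
  assert((A - B) + (C - A) == C - B);     // AMinusBPlusCMinusA
  assert((A - B) + (B - C) == A - C);     // AMinusBPlusBMinusC
  assert(A + (B - (A + C)) == B - C);     // APlusBMinusAplusC
  assert(A + (B - (C + A)) == B - C);     // APlusBMinusCPlusA
  return 0;
}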
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 5670767ff7ed..8daa8a689c95 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -159,20 +159,20 @@ X86_FEATURE_COMPAT(AVX512VL, "avx512vl", 20)
X86_FEATURE_COMPAT(AVX512BW, "avx512bw", 21)
X86_FEATURE_COMPAT(AVX512DQ, "avx512dq", 22)
X86_FEATURE_COMPAT(AVX512CD, "avx512cd", 23)
-X86_FEATURE_COMPAT(AVX512ER, "avx512er", 24)
-X86_FEATURE_COMPAT(AVX512PF, "avx512pf", 25)
-X86_FEATURE_COMPAT(AVX512VBMI, "avx512vbmi", 26)
-X86_FEATURE_COMPAT(AVX512IFMA, "avx512ifma", 27)
-X86_FEATURE_COMPAT(AVX5124VNNIW, "avx5124vnniw", 28)
-X86_FEATURE_COMPAT(AVX5124FMAPS, "avx5124fmaps", 29)
-X86_FEATURE_COMPAT(AVX512VPOPCNTDQ, "avx512vpopcntdq", 30)
-X86_FEATURE_COMPAT(AVX512VBMI2, "avx512vbmi2", 31)
-X86_FEATURE_COMPAT(GFNI, "gfni", 32)
-X86_FEATURE_COMPAT(VPCLMULQDQ, "vpclmulqdq", 33)
-X86_FEATURE_COMPAT(AVX512VNNI, "avx512vnni", 34)
-X86_FEATURE_COMPAT(AVX512BITALG, "avx512bitalg", 35)
-X86_FEATURE_COMPAT(AVX512BF16, "avx512bf16", 36)
-X86_FEATURE_COMPAT(AVX512VP2INTERSECT, "avx512vp2intersect", 37)
+X86_FEATURE (NF, "nf")
+X86_FEATURE (CF, "cf")
+X86_FEATURE_COMPAT(AVX512VBMI, "avx512vbmi", 24)
+X86_FEATURE_COMPAT(AVX512IFMA, "avx512ifma", 25)
+X86_FEATURE_COMPAT(AVX5124VNNIW, "avx5124vnniw", 26)
+X86_FEATURE_COMPAT(AVX5124FMAPS, "avx5124fmaps", 27)
+X86_FEATURE_COMPAT(AVX512VPOPCNTDQ, "avx512vpopcntdq", 28)
+X86_FEATURE_COMPAT(AVX512VBMI2, "avx512vbmi2", 29)
+X86_FEATURE_COMPAT(GFNI, "gfni", 30)
+X86_FEATURE_COMPAT(VPCLMULQDQ, "vpclmulqdq", 31)
+X86_FEATURE_COMPAT(AVX512VNNI, "avx512vnni", 32)
+X86_FEATURE_COMPAT(AVX512BITALG, "avx512bitalg", 33)
+X86_FEATURE_COMPAT(AVX512BF16, "avx512bf16", 34)
+X86_FEATURE_COMPAT(AVX512VP2INTERSECT, "avx512vp2intersect", 35)
// Below Features has some missings comparing to gcc, it's because gcc has some
// not one-to-one mapped in llvm.
X86_FEATURE_COMPAT(3DNOW, "3dnow", 0)
@@ -202,7 +202,7 @@ X86_FEATURE_COMPAT(MWAITX, "mwaitx", 0)
X86_FEATURE (X87, "x87")
X86_FEATURE_COMPAT(PCONFIG, "pconfig", 0)
X86_FEATURE_COMPAT(PKU, "pku", 0)
-X86_FEATURE_COMPAT(PREFETCHWT1, "prefetchwt1", 0)
+X86_FEATURE (EVEX512, "evex512")
X86_FEATURE_COMPAT(PRFCHW, "prfchw", 0)
X86_FEATURE_COMPAT(PTWRITE, "ptwrite", 0)
X86_FEATURE_COMPAT(RDPID, "rdpid", 0)
@@ -252,9 +252,6 @@ X86_FEATURE (EGPR, "egpr")
X86_FEATURE_COMPAT(USERMSR, "usermsr", 0)
X86_FEATURE_COMPAT(AVX10_1, "avx10.1-256", 0)
X86_FEATURE_COMPAT(AVX10_1_512, "avx10.1-512", 0)
-X86_FEATURE (EVEX512, "evex512")
-X86_FEATURE (NF, "nf")
-X86_FEATURE (CF, "cf")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
index 024bba8105b8..72a0823c6627 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h
@@ -212,11 +212,15 @@ bool convertToDeclaration(GlobalValue &GV);
/// \p ModuleToSummariesForIndex will be populated with the needed summaries
/// from each required module path. Use a std::map instead of StringMap to get
/// stable order for bitcode emission.
+///
+/// \p DecSummaries will be populated with the subset of summary pointers
+/// that have 'declaration' import type among all summaries the module needs.
void gatherImportedSummariesForModule(
StringRef ModulePath,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
const FunctionImporter::ImportMapTy &ImportList,
- std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex);
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
+ GVSummaryPtrSet &DecSummaries);
/// Emit into \p OutputFilename the files module \p ModulePath will import from.
std::error_code EmitImportsFiles(
diff --git a/llvm/lib/Analysis/CFG.cpp b/llvm/lib/Analysis/CFG.cpp
index 8528aa9f77e0..841b83505238 100644
--- a/llvm/lib/Analysis/CFG.cpp
+++ b/llvm/lib/Analysis/CFG.cpp
@@ -130,14 +130,21 @@ static const Loop *getOutermostLoop(const LoopInfo *LI, const BasicBlock *BB) {
return L ? L->getOutermostLoop() : nullptr;
}
-bool llvm::isPotentiallyReachableFromMany(
- SmallVectorImpl<BasicBlock *> &Worklist, const BasicBlock *StopBB,
- const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
- const LoopInfo *LI) {
- // When the stop block is unreachable, it's dominated from everywhere,
+template <class StopSetT>
+static bool isReachableImpl(SmallVectorImpl<BasicBlock *> &Worklist,
+ const StopSetT &StopSet,
+ const SmallPtrSetImpl<BasicBlock *> *ExclusionSet,
+ const DominatorTree *DT, const LoopInfo *LI) {
+ // When a stop block is unreachable, it's dominated from everywhere,
// regardless of whether there's a path between the two blocks.
- if (DT && !DT->isReachableFromEntry(StopBB))
- DT = nullptr;
+ if (DT) {
+ for (auto *BB : StopSet) {
+ if (!DT->isReachableFromEntry(BB)) {
+ DT = nullptr;
+ break;
+ }
+ }
+ }
// We can't skip directly from a block that dominates the stop block if the
// exclusion block is potentially in between.
@@ -155,7 +162,13 @@ bool llvm::isPotentiallyReachableFromMany(
}
}
- const Loop *StopLoop = LI ? getOutermostLoop(LI, StopBB) : nullptr;
+ SmallPtrSet<const Loop *, 2> StopLoops;
+ if (LI) {
+ for (auto *StopSetBB : StopSet) {
+ if (const Loop *L = getOutermostLoop(LI, StopSetBB))
+ StopLoops.insert(L);
+ }
+ }
unsigned Limit = DefaultMaxBBsToExplore;
SmallPtrSet<const BasicBlock*, 32> Visited;
@@ -163,12 +176,16 @@ bool llvm::isPotentiallyReachableFromMany(
BasicBlock *BB = Worklist.pop_back_val();
if (!Visited.insert(BB).second)
continue;
- if (BB == StopBB)
+ if (StopSet.contains(BB))
return true;
if (ExclusionSet && ExclusionSet->count(BB))
continue;
- if (DT && DT->dominates(BB, StopBB))
- return true;
+ if (DT) {
+ if (llvm::any_of(StopSet, [&](const BasicBlock *StopBB) {
+ return DT->dominates(BB, StopBB);
+ }))
+ return true;
+ }
const Loop *Outer = nullptr;
if (LI) {
@@ -179,7 +196,7 @@ bool llvm::isPotentiallyReachableFromMany(
// excluded block. Clear Outer so we process BB's successors.
if (LoopsWithHoles.count(Outer))
Outer = nullptr;
- if (StopLoop && Outer == StopLoop)
+ if (StopLoops.contains(Outer))
return true;
}
@@ -204,6 +221,39 @@ bool llvm::isPotentiallyReachableFromMany(
return false;
}
+template <class T> class SingleEntrySet {
+public:
+ using const_iterator = const T *;
+
+ SingleEntrySet(T Elem) : Elem(Elem) {}
+
+ bool contains(T Other) const { return Elem == Other; }
+
+ const_iterator begin() const { return &Elem; }
+ const_iterator end() const { return &Elem + 1; }
+
+private:
+ T Elem;
+};
+
+bool llvm::isPotentiallyReachableFromMany(
+ SmallVectorImpl<BasicBlock *> &Worklist, const BasicBlock *StopBB,
+ const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
+ const LoopInfo *LI) {
+ return isReachableImpl<SingleEntrySet<const BasicBlock *>>(
+ Worklist, SingleEntrySet<const BasicBlock *>(StopBB), ExclusionSet, DT,
+ LI);
+}
+
+bool llvm::isManyPotentiallyReachableFromMany(
+ SmallVectorImpl<BasicBlock *> &Worklist,
+ const SmallPtrSetImpl<const BasicBlock *> &StopSet,
+ const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
+ const LoopInfo *LI) {
+ return isReachableImpl<SmallPtrSetImpl<const BasicBlock *>>(
+ Worklist, StopSet, ExclusionSet, DT, LI);
+}
+
bool llvm::isPotentiallyReachable(
const BasicBlock *A, const BasicBlock *B,
const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
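
A minimal usage sketch of the new many-stop-block entry point, assuming its
declaration sits next to isPotentiallyReachableFromMany in llvm/Analysis/CFG.h
(the header change is not shown in this hunk); DT and LI may be null for a
purely structural query:

    // Sketch: does any successor of From potentially reach any block in Stops?
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/CFG.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/CFG.h"

    using namespace llvm;

    static bool anySuccessorReachesAnyStop(BasicBlock *From,
                                           ArrayRef<BasicBlock *> Stops,
                                           const DominatorTree *DT = nullptr,
                                           const LoopInfo *LI = nullptr) {
      // Seed the worklist with the successors of From, as the existing
      // isPotentiallyReachable helpers do before delegating to the
      // *FromMany variants.
      SmallVector<BasicBlock *, 8> Worklist(succ_begin(From), succ_end(From));
      SmallPtrSet<const BasicBlock *, 4> StopSet(Stops.begin(), Stops.end());
      return isManyPotentiallyReachableFromMany(Worklist, StopSet,
                                                /*ExclusionSet=*/nullptr, DT,
                                                LI);
    }
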
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a967f570c4a..bc8b9b8479e4 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -392,9 +392,9 @@ void RuntimePointerChecking::generateChecks(
bool RuntimePointerChecking::needsChecking(
const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
- for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
- for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
- if (needsChecking(M.Members[I], N.Members[J]))
+ for (const auto &I : M.Members)
+ for (const auto &J : N.Members)
+ if (needsChecking(I, J))
return true;
return false;
}
@@ -408,9 +408,7 @@ static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
if (!C)
return nullptr;
- if (C->getValue()->isNegative())
- return J;
- return I;
+ return C->getValue()->isNegative() ? J : I;
}
bool RuntimeCheckingPtrGroup::addPointer(unsigned Index,
@@ -508,8 +506,8 @@ void RuntimePointerChecking::groupChecks(
DenseMap<Value *, SmallVector<unsigned>> PositionMap;
for (unsigned Index = 0; Index < Pointers.size(); ++Index) {
- auto Iter = PositionMap.insert({Pointers[Index].PointerValue, {}});
- Iter.first->second.push_back(Index);
+ auto [It, _] = PositionMap.insert({Pointers[Index].PointerValue, {}});
+ It->second.push_back(Index);
}
// We need to keep track of what pointers we've already seen so we
@@ -608,16 +606,16 @@ void RuntimePointerChecking::printChecks(
raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
unsigned Depth) const {
unsigned N = 0;
- for (const auto &Check : Checks) {
- const auto &First = Check.first->Members, &Second = Check.second->Members;
+ for (const auto &[Check1, Check2] : Checks) {
+ const auto &First = Check1->Members, &Second = Check2->Members;
OS.indent(Depth) << "Check " << N++ << ":\n";
- OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
+ OS.indent(Depth + 2) << "Comparing group (" << Check1 << "):\n";
for (unsigned K = 0; K < First.size(); ++K)
OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
- OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
+ OS.indent(Depth + 2) << "Against group (" << Check2 << "):\n";
for (unsigned K = 0; K < Second.size(); ++K)
OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
}
@@ -1158,8 +1156,8 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
// First, count how many write and read accesses are in the alias set. Also
// collect MemAccessInfos for later.
SmallVector<MemAccessInfo, 4> AccessInfos;
- for (const Value *Ptr_ : ASPointers) {
- Value *Ptr = const_cast<Value *>(Ptr_);
+ for (const Value *ConstPtr : ASPointers) {
+ Value *Ptr = const_cast<Value *>(ConstPtr);
bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
if (IsWrite)
++NumWritePtrChecks;
@@ -1215,9 +1213,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
// We know that we need these checks, so we can now be more aggressive
// and add further checks if required (overflow checks).
CanDoAliasSetRT = true;
- for (auto Retry : Retries) {
- MemAccessInfo Access = Retry.first;
- Type *AccessTy = Retry.second;
+ for (const auto &[Access, AccessTy] : Retries) {
if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
DepSetId, TheLoop, RunningDepId, ASId,
ShouldCheckWrap, /*Assume=*/true)) {
@@ -1289,12 +1285,11 @@ void AccessAnalysis::processMemAccesses() {
LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
LLVM_DEBUG({
- for (auto A : Accesses)
- dbgs() << "\t" << *A.first.getPointer() << " ("
- << (A.first.getInt()
- ? "write"
- : (ReadOnlyPtr.count(A.first.getPointer()) ? "read-only"
- : "read"))
+ for (const auto &[A, _] : Accesses)
+ dbgs() << "\t" << *A.getPointer() << " ("
+ << (A.getInt() ? "write"
+ : (ReadOnlyPtr.count(A.getPointer()) ? "read-only"
+ : "read"))
<< ")\n";
});
@@ -1323,16 +1318,16 @@ void AccessAnalysis::processMemAccesses() {
bool UseDeferred = SetIteration > 0;
PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;
- for (const Value *Ptr_ : ASPointers) {
- Value *Ptr = const_cast<Value *>(Ptr_);
+ for (const Value *ConstPtr : ASPointers) {
+ Value *Ptr = const_cast<Value *>(ConstPtr);
// For a single memory access in AliasSetTracker, Accesses may contain
// both read and write, and they both need to be handled for CheckDeps.
- for (const auto &AC : S) {
- if (AC.first.getPointer() != Ptr)
+ for (const auto &[AC, _] : S) {
+ if (AC.getPointer() != Ptr)
continue;
- bool IsWrite = AC.first.getInt();
+ bool IsWrite = AC.getInt();
// If we're using the deferred access set, then it contains only
// reads.
@@ -1859,10 +1854,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
// (If so, then we have proven (**) because |Dist| >= -1*Dist)
const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
Minus = SE.getMinusSCEV(NegDist, CastedProduct);
- if (SE.isKnownPositive(Minus))
- return true;
-
- return false;
+ return SE.isKnownPositive(Minus);
}
/// Check the dependence for two accesses with the same stride \p Stride.
@@ -2050,7 +2042,7 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
if (isa<SCEVCouldNotCompute>(Dist)) {
// TODO: Relax requirement that there is a common stride to retry with
// non-constant distance dependencies.
- FoundNonConstantDistanceDependence |= !!CommonStride;
+ FoundNonConstantDistanceDependence |= CommonStride.has_value();
LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
return Dependence::Unknown;
}
@@ -2093,11 +2085,10 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
if (HasSameSize) {
// Write to the same location with the same size.
return Dependence::Forward;
- } else {
- LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
- "different type sizes\n");
- return Dependence::Unknown;
}
+ LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
+ "different type sizes\n");
+ return Dependence::Unknown;
}
bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
@@ -2343,7 +2334,7 @@ bool MemoryDepChecker::areDepsSafe(
}
++OI;
}
- AI++;
+ ++AI;
}
}
@@ -2352,8 +2343,8 @@ bool MemoryDepChecker::areDepsSafe(
}
SmallVector<Instruction *, 4>
-MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
- MemAccessInfo Access(Ptr, isWrite);
+MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
+ MemAccessInfo Access(Ptr, IsWrite);
auto &IndexVector = Accesses.find(Access)->second;
SmallVector<Instruction *, 4> Insts;
@@ -2729,13 +2720,14 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
}
void LoopAccessInfo::emitUnsafeDependenceRemark() {
- auto Deps = getDepChecker().getDependences();
+ const auto *Deps = getDepChecker().getDependences();
if (!Deps)
return;
- auto Found = llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
- return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
- MemoryDepChecker::VectorizationSafetyStatus::Safe;
- });
+ const auto *Found =
+ llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
+ return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
+ MemoryDepChecker::VectorizationSafetyStatus::Safe;
+ });
if (Found == Deps->end())
return;
MemoryDepChecker::Dependence Dep = *Found;
@@ -2874,9 +2866,9 @@ static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
// Check that all of the gep indices are uniform except for our induction
// operand.
- for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
- if (i != InductionOperand &&
- !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
+ for (unsigned I = 0, E = GEP->getNumOperands(); I != E; ++I)
+ if (I != InductionOperand &&
+ !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(I)), Lp))
return Ptr;
return GEP->getOperand(InductionOperand);
}
@@ -3072,9 +3064,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
DepChecker =
std::make_unique<MemoryDepChecker>(*PSE, L, MaxTargetVectorWidthInBits);
PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
- if (canAnalyzeLoop()) {
+ if (canAnalyzeLoop())
analyzeLoop(AA, LI, TLI, DT);
- }
}
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
@@ -3126,13 +3117,13 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
}
const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L) {
- auto I = LoopAccessInfoMap.insert({&L, nullptr});
+ auto [It, Inserted] = LoopAccessInfoMap.insert({&L, nullptr});
- if (I.second)
- I.first->second =
+ if (Inserted)
+ It->second =
std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT, &LI);
- return *I.first->second;
+ return *It->second;
}
bool LoopAccessInfoManager::invalidate(
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 704f92669a11..b83e2b435f5d 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -9198,8 +9198,25 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromICmp(
// Since the loop is finite, an invariant RHS cannot include the boundary
// value, otherwise it would loop forever.
if (!EnableFiniteLoopControl || !ControllingFiniteLoop ||
- !isLoopInvariant(RHS, L))
- break;
+ !isLoopInvariant(RHS, L)) {
+ // Otherwise, perform the addition in a wider type, to avoid overflow.
+ // If the LHS is an addrec with the appropriate nowrap flag, the
+ // extension will be sunk into it and the exit count can be analyzed.
+ auto *OldType = dyn_cast<IntegerType>(LHS->getType());
+ if (!OldType)
+ break;
+ // Prefer doubling the bitwidth over adding a single bit to make it more
+ // likely that we use a legal type.
+ auto *NewType =
+ Type::getIntNTy(OldType->getContext(), OldType->getBitWidth() * 2);
+ if (ICmpInst::isSigned(Pred)) {
+ LHS = getSignExtendExpr(LHS, NewType);
+ RHS = getSignExtendExpr(RHS, NewType);
+ } else {
+ LHS = getZeroExtendExpr(LHS, NewType);
+ RHS = getZeroExtendExpr(RHS, NewType);
+ }
+ }
RHS = getAddExpr(getOne(RHS->getType()), RHS);
[[fallthrough]];
case ICmpInst::ICMP_SLT:
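
A small standalone illustration of the overflow that the widening above avoids,
using plain C++ integer types rather than SCEV expressions (hypothetical values,
not taken from the patch):

    #include <cstdint>
    #include <iostream>

    int main() {
      // Exit test of the form "i <= Bound" with an 8-bit unsigned bound.
      uint8_t Bound = 255;

      // Rewriting "i <= Bound" as "i < Bound + 1" in the original type wraps.
      uint8_t NarrowPlusOne = static_cast<uint8_t>(Bound + 1); // 0: wrong bound

      // Doubling the bit width first (as the zero/sign extension above does)
      // keeps Bound + 1 representable, so the exit count stays analyzable.
      uint16_t WidePlusOne = static_cast<uint16_t>(Bound) + 1; // 256: correct

      std::cout << unsigned(NarrowPlusOne) << " vs " << WidePlusOne << "\n";
      return 0;
    }
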
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index 592caf2d0e23..6b760fbde5bb 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -1204,26 +1204,31 @@ void TargetLibraryInfoImpl::addVectorizableFunctions(ArrayRef<VecDesc> Fns) {
static const VecDesc VecFuncs_Accelerate[] = {
#define TLI_DEFINE_ACCELERATE_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_ACCELERATE_VECFUNCS
};
static const VecDesc VecFuncs_DarwinLibSystemM[] = {
#define TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_DARWIN_LIBSYSTEM_M_VECFUNCS
};
static const VecDesc VecFuncs_LIBMVEC_X86[] = {
#define TLI_DEFINE_LIBMVEC_X86_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_LIBMVEC_X86_VECFUNCS
};
static const VecDesc VecFuncs_MASSV[] = {
#define TLI_DEFINE_MASSV_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_MASSV_VECFUNCS
};
static const VecDesc VecFuncs_SVML[] = {
#define TLI_DEFINE_SVML_VECFUNCS
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_SVML_VECFUNCS
};
static const VecDesc VecFuncs_SLEEFGNUABI_VF2[] = {
@@ -1231,18 +1236,21 @@ static const VecDesc VecFuncs_SLEEFGNUABI_VF2[] = {
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) \
{SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_SLEEFGNUABI_VF2_VECFUNCS
};
static const VecDesc VecFuncs_SLEEFGNUABI_VF4[] = {
#define TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) \
{SCAL, VEC, VF, /* MASK = */ false, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_SLEEFGNUABI_VF4_VECFUNCS
};
static const VecDesc VecFuncs_SLEEFGNUABI_VFScalable[] = {
#define TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \
{SCAL, VEC, VF, MASK, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_SLEEFGNUABI_SCALABLE_VECFUNCS
};
static const VecDesc VecFuncs_ArmPL[] = {
@@ -1250,6 +1258,7 @@ static const VecDesc VecFuncs_ArmPL[] = {
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \
{SCAL, VEC, VF, MASK, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_ARMPL_VECFUNCS
};
const VecDesc VecFuncs_AMDLIBM[] = {
@@ -1257,6 +1266,7 @@ const VecDesc VecFuncs_AMDLIBM[] = {
#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, MASK, VABI_PREFIX) \
{SCAL, VEC, VF, MASK, VABI_PREFIX},
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_AMDLIBM_VECFUNCS
};
void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib(
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index f6a458f7ded4..82b6d7e7c483 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1037,7 +1037,7 @@ TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
InstructionCost TargetTransformInfo::getReplicationShuffleCost(
Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
- TTI::TargetCostKind CostKind) {
+ TTI::TargetCostKind CostKind) const {
InstructionCost Cost = TTIImpl->getReplicationShuffleCost(
EltTy, ReplicationFactor, VF, DemandedDstElts, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index c4cea3d6eef2..c5fdd1116c9f 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -428,6 +428,11 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
/// The combined index to write to bitcode.
const ModuleSummaryIndex &Index;
+ /// When writing combined summaries, provides the set of global value
+ /// summaries for which the value (function, function alias, etc) should be
+ /// imported as a declaration.
+ const GVSummaryPtrSet *DecSummaries = nullptr;
+
/// When writing a subset of the index for distributed backends, client
/// provides a map of modules to the corresponding GUIDs/summaries to write.
const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex;
@@ -452,11 +457,16 @@ public:
/// Constructs a IndexBitcodeWriter object for the given combined index,
/// writing to the provided \p Buffer. When writing a subset of the index
/// for a distributed backend, provide a \p ModuleToSummariesForIndex map.
+ /// If provided, \p DecSummaries specifies the set of summaries for
+ /// which the corresponding functions or aliased functions should be imported
+ /// as a declaration (but not definition) for each module.
IndexBitcodeWriter(BitstreamWriter &Stream, StringTableBuilder &StrtabBuilder,
const ModuleSummaryIndex &Index,
+ const GVSummaryPtrSet *DecSummaries = nullptr,
const std::map<std::string, GVSummaryMapTy>
*ModuleToSummariesForIndex = nullptr)
: BitcodeWriterBase(Stream, StrtabBuilder), Index(Index),
+ DecSummaries(DecSummaries),
ModuleToSummariesForIndex(ModuleToSummariesForIndex) {
// Assign unique value ids to all summaries to be written, for use
// in writing out the call graph edges. Save the mapping from GUID
@@ -1202,7 +1212,8 @@ static uint64_t getEncodedFFlags(FunctionSummary::FFlags Flags) {
// Decode the flags for GlobalValue in the summary. See getDecodedGVSummaryFlags
// in BitcodeReader.cpp.
-static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
+static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags,
+ bool ImportAsDecl = false) {
uint64_t RawFlags = 0;
RawFlags |= Flags.NotEligibleToImport; // bool
@@ -1217,7 +1228,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
RawFlags |= (Flags.Visibility << 8); // 2 bits
- RawFlags |= (Flags.ImportType << 10); // 1 bit
+ unsigned ImportType = Flags.ImportType | ImportAsDecl;
+ RawFlags |= (ImportType << 10); // 1 bit
return RawFlags;
}
@@ -4543,6 +4555,12 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+ auto shouldImportValueAsDecl = [&](GlobalValueSummary *GVS) -> bool {
+ if (DecSummaries == nullptr)
+ return false;
+ return DecSummaries->contains(GVS);
+ };
+
// The aliases are emitted as a post-pass, and will point to the value
// id of the aliasee. Save them in a vector for post-processing.
SmallVector<AliasSummary *, 64> Aliases;
@@ -4653,7 +4671,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
NameVals.push_back(*ValueId);
assert(ModuleIdMap.count(FS->modulePath()));
NameVals.push_back(ModuleIdMap[FS->modulePath()]);
- NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
+ NameVals.push_back(
+ getEncodedGVSummaryFlags(FS->flags(), shouldImportValueAsDecl(FS)));
NameVals.push_back(FS->instCount());
NameVals.push_back(getEncodedFFlags(FS->fflags()));
NameVals.push_back(FS->entryCount());
@@ -4702,7 +4721,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
NameVals.push_back(AliasValueId);
assert(ModuleIdMap.count(AS->modulePath()));
NameVals.push_back(ModuleIdMap[AS->modulePath()]);
- NameVals.push_back(getEncodedGVSummaryFlags(AS->flags()));
+ NameVals.push_back(
+ getEncodedGVSummaryFlags(AS->flags(), shouldImportValueAsDecl(AS)));
auto AliaseeValueId = SummaryToValueIdMap[&AS->getAliasee()];
assert(AliaseeValueId);
NameVals.push_back(AliaseeValueId);
@@ -5036,8 +5056,9 @@ void BitcodeWriter::writeModule(const Module &M,
void BitcodeWriter::writeIndex(
const ModuleSummaryIndex *Index,
- const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex) {
- IndexBitcodeWriter IndexWriter(*Stream, StrtabBuilder, *Index,
+ const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+ const GVSummaryPtrSet *DecSummaries) {
+ IndexBitcodeWriter IndexWriter(*Stream, StrtabBuilder, *Index, DecSummaries,
ModuleToSummariesForIndex);
IndexWriter.write();
}
@@ -5090,12 +5111,13 @@ void IndexBitcodeWriter::write() {
// index for a distributed backend, provide a \p ModuleToSummariesForIndex map.
void llvm::writeIndexToFile(
const ModuleSummaryIndex &Index, raw_ostream &Out,
- const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex) {
+ const std::map<std::string, GVSummaryMapTy> *ModuleToSummariesForIndex,
+ const GVSummaryPtrSet *DecSummaries) {
SmallVector<char, 0> Buffer;
Buffer.reserve(256 * 1024);
BitcodeWriter Writer(Buffer);
- Writer.writeIndex(&Index, ModuleToSummariesForIndex);
+ Writer.writeIndex(&Index, ModuleToSummariesForIndex, DecSummaries);
Writer.writeStrtab();
Out.write((char *)&Buffer.front(), Buffer.size());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 6022afbae574..c1e7f01f0eba 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1539,8 +1539,8 @@ void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name,
}
/// Add a new global type to the unit.
-void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) {
+void DwarfCompileUnit::addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
if (!hasDwarfPubSections())
return;
std::string FullName = getParentContextString(Context) + Ty->getName().str();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index dc772bb459c9..76584b3eb8e7 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -335,8 +335,8 @@ public:
void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context);
/// Add a new global type to the compile unit.
- void addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) override;
+ void addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
/// Add a new global type present in a type unit to this compile unit.
void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 1e33c2729e5d..6c04fa1c67a9 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -578,28 +578,33 @@ DIE *DwarfUnit::createTypeDIE(const DIScope *Context, DIE &ContextDIE,
// Create new type.
DIE &TyDIE = createAndAddDIE(Ty->getTag(), ContextDIE, Ty);
- updateAcceleratorTables(Context, Ty, TyDIE);
+ auto construct = [&](const auto *Ty) {
+ updateAcceleratorTables(Context, Ty, TyDIE);
+ constructTypeDIE(TyDIE, Ty);
+ };
- if (auto *BT = dyn_cast<DIBasicType>(Ty))
- constructTypeDIE(TyDIE, BT);
- else if (auto *ST = dyn_cast<DIStringType>(Ty))
- constructTypeDIE(TyDIE, ST);
- else if (auto *STy = dyn_cast<DISubroutineType>(Ty))
- constructTypeDIE(TyDIE, STy);
- else if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
+ if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
if (DD->generateTypeUnits() && !Ty->isForwardDecl() &&
(Ty->getRawName() || CTy->getRawIdentifier())) {
// Skip updating the accelerator tables since this is not the full type.
- if (MDString *TypeId = CTy->getRawIdentifier())
+ if (MDString *TypeId = CTy->getRawIdentifier()) {
+ addGlobalType(Ty, TyDIE, Context);
DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy);
- else
+ } else {
+ updateAcceleratorTables(Context, Ty, TyDIE);
finishNonUnitTypeDIE(TyDIE, CTy);
+ }
return &TyDIE;
}
- constructTypeDIE(TyDIE, CTy);
- } else {
- constructTypeDIE(TyDIE, cast<DIDerivedType>(Ty));
- }
+ construct(CTy);
+ } else if (auto *BT = dyn_cast<DIBasicType>(Ty))
+ construct(BT);
+ else if (auto *ST = dyn_cast<DIStringType>(Ty))
+ construct(ST);
+ else if (auto *STy = dyn_cast<DISubroutineType>(Ty))
+ construct(STy);
+ else
+ construct(cast<DIDerivedType>(Ty));
return &TyDIE;
}
@@ -633,21 +638,31 @@ DIE *DwarfUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
void DwarfUnit::updateAcceleratorTables(const DIScope *Context,
const DIType *Ty, const DIE &TyDIE) {
- if (!Ty->getName().empty() && !Ty->isForwardDecl()) {
- bool IsImplementation = false;
- if (auto *CT = dyn_cast<DICompositeType>(Ty)) {
- // A runtime language of 0 actually means C/C++ and that any
- // non-negative value is some version of Objective-C/C++.
- IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete();
- }
- unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
- DD->addAccelType(*this, CUNode->getNameTableKind(), Ty->getName(), TyDIE,
- Flags);
+ if (Ty->getName().empty())
+ return;
+ if (Ty->isForwardDecl())
+ return;
- if (!Context || isa<DICompileUnit>(Context) || isa<DIFile>(Context) ||
- isa<DINamespace>(Context) || isa<DICommonBlock>(Context))
- addGlobalType(Ty, TyDIE, Context);
+ // Add a temporary record for this type, to be added later.
+
+ bool IsImplementation = false;
+ if (auto *CT = dyn_cast<DICompositeType>(Ty)) {
+ // A runtime language of 0 actually means C/C++ and that any
+ // non-negative value is some version of Objective-C/C++.
+ IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete();
}
+ unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
+ DD->addAccelType(*this, CUNode->getNameTableKind(), Ty->getName(), TyDIE,
+ Flags);
+
+ addGlobalType(Ty, TyDIE, Context);
+}
+
+void DwarfUnit::addGlobalType(const DIType *Ty, const DIE &TyDIE,
+ const DIScope *Context) {
+ if (!Context || isa<DICompileUnit>(Context) || isa<DIFile>(Context) ||
+ isa<DINamespace>(Context) || isa<DICommonBlock>(Context))
+ addGlobalTypeImpl(Ty, TyDIE, Context);
}
void DwarfUnit::addType(DIE &Entity, const DIType *Ty,
@@ -1844,8 +1859,8 @@ void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die,
getCU().addGlobalNameForTypeUnit(Name, Context);
}
-void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) {
+void DwarfTypeUnit::addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
getCU().addGlobalTypeUnitType(Ty, Context);
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 18f50f86ec87..02256546b6b8 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -128,8 +128,10 @@ public:
const DIScope *Context) = 0;
/// Add a new global type to the compile unit.
- virtual void addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) = 0;
+ virtual void addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) = 0;
+
+ void addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context);
/// Returns the DIE map slot for the specified debug variable.
///
@@ -397,8 +399,8 @@ public:
}
void addGlobalName(StringRef Name, const DIE &Die,
const DIScope *Context) override;
- void addGlobalType(const DIType *Ty, const DIE &Die,
- const DIScope *Context) override;
+ void addGlobalTypeImpl(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
DwarfCompileUnit &getCU() override { return CU; }
};
} // end llvm namespace
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index ee44e9353d04..d2b756e82964 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -37,6 +37,7 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -937,6 +938,36 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
AI->eraseFromParent();
}
+/// Copy metadata that's safe to preserve when widening atomics.
+static void copyMetadataForAtomic(Instruction &Dest,
+ const Instruction &Source) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ Source.getAllMetadata(MD);
+ LLVMContext &Ctx = Dest.getContext();
+ MDBuilder MDB(Ctx);
+
+ for (auto [ID, N] : MD) {
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_access_group:
+ case LLVMContext::MD_mmra:
+ Dest.setMetadata(ID, N);
+ break;
+ default:
+ if (ID == Ctx.getMDKindID("amdgpu.no.remote.memory"))
+ Dest.setMetadata(ID, N);
+ else if (ID == Ctx.getMDKindID("amdgpu.no.fine.grained.memory"))
+ Dest.setMetadata(ID, N);
+
+ break;
+ }
+ }
+}
+
// Widen the bitwise atomicrmw (or/xor/and) to the minimum supported width.
AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
ReplacementIRBuilder Builder(AI, *DL);
@@ -965,7 +996,8 @@ AtomicRMWInst *AtomicExpandImpl::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(
Op, PMV.AlignedAddr, NewOperand, PMV.AlignedAddrAlignment,
AI->getOrdering(), AI->getSyncScopeID());
- // TODO: Preserve metadata
+
+ copyMetadataForAtomic(*NewAI, *AI);
Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV);
AI->replaceAllUsesWith(FinalOldResult);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 22eb4a3e0d7c..4cc602b5c870 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -223,6 +223,70 @@ void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
replaceRegWith(MRI, DstReg, SrcReg);
}
+bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand(
+ MachineInstr &MI, BuildFnTy &MatchInfo) {
+ // Ported from InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating.
+ Register DstOp = MI.getOperand(0).getReg();
+ Register OrigOp = MI.getOperand(1).getReg();
+
+ if (!MRI.hasOneNonDBGUse(OrigOp))
+ return false;
+
+ MachineInstr *OrigDef = MRI.getUniqueVRegDef(OrigOp);
+ // Even if only a single operand of the PHI is not guaranteed non-poison,
+ // moving freeze() backwards across a PHI can cause optimization issues for
+ // other users of that operand.
+ //
+ // Moving freeze() from one of the output registers of a G_UNMERGE_VALUES to
+ // the source register is unprofitable because it makes the freeze() more
+ // strict than is necessary (it would affect the whole register instead of
+ // just the subreg being frozen).
+ if (OrigDef->isPHI() || isa<GUnmerge>(OrigDef))
+ return false;
+
+ if (canCreateUndefOrPoison(OrigOp, MRI,
+ /*ConsiderFlagsAndMetadata=*/false))
+ return false;
+
+ std::optional<MachineOperand> MaybePoisonOperand;
+ for (MachineOperand &Operand : OrigDef->uses()) {
+ if (!Operand.isReg())
+ return false;
+
+ if (isGuaranteedNotToBeUndefOrPoison(Operand.getReg(), MRI))
+ continue;
+
+ if (!MaybePoisonOperand)
+ MaybePoisonOperand = Operand;
+ else {
+ // We have more than one maybe-poison operand. Moving the freeze is
+ // unsafe.
+ return false;
+ }
+ }
+
+ cast<GenericMachineInstr>(OrigDef)->dropPoisonGeneratingFlags();
+
+ // Eliminate freeze if all operands are guaranteed non-poison.
+ if (!MaybePoisonOperand) {
+ MatchInfo = [=](MachineIRBuilder &B) { MRI.replaceRegWith(DstOp, OrigOp); };
+ return true;
+ }
+
+ Register MaybePoisonOperandReg = MaybePoisonOperand->getReg();
+ LLT MaybePoisonOperandRegTy = MRI.getType(MaybePoisonOperandReg);
+
+ MatchInfo = [=](MachineIRBuilder &B) mutable {
+ B.setInsertPt(*OrigDef->getParent(), OrigDef->getIterator());
+ auto Freeze = B.buildFreeze(MaybePoisonOperandRegTy, MaybePoisonOperandReg);
+ replaceRegOpWith(
+ MRI, *OrigDef->findRegisterUseOperand(MaybePoisonOperandReg, TRI),
+ Freeze.getReg(0));
+ replaceRegWith(MRI, DstOp, OrigOp);
+ };
+ return true;
+}
+
bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI,
SmallVector<Register> &Ops) {
assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index 14e1e1fdf01d..5acf35b37882 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -538,6 +538,13 @@ bool InlineAsmLowering::lowerInlineAsm(
}
}
+ // Add rounding control registers as implicit def for inline asm.
+ if (MF.getFunction().hasFnAttribute(Attribute::StrictFP)) {
+ ArrayRef<MCPhysReg> RCRegs = TLI->getRoundingControlRegisters();
+ for (MCPhysReg Reg : RCRegs)
+ Inst.addReg(Reg, RegState::ImplicitDefine);
+ }
+
if (auto Bundle = Call.getOperandBundle(LLVMContext::OB_convergencectrl)) {
auto *Token = Bundle->Inputs[0].get();
ArrayRef<Register> SourceRegs = GetOrCreateVRegs(*Token);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 40507845d8d8..d8b0f52ecf9e 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -1296,7 +1296,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MI.eraseFromParent();
return Legalized;
}
-
+ case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
case TargetOpcode::G_FREEZE: {
if (TypeIdx != 0)
return UnableToLegalize;
@@ -1310,7 +1310,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
SmallVector<Register, 8> Parts;
for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
Parts.push_back(
- MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
+ MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy}, {Unmerge.getReg(i)})
+ .getReg(0));
}
MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
@@ -2515,6 +2516,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
return Legalized;
}
case TargetOpcode::G_FREEZE:
+ case TargetOpcode::G_CONSTANT_FOLD_BARRIER:
Observer.changingInstr(MI);
widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
widenScalarDst(MI, WideTy);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index cd5dc0e01ed0..f455482e0294 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -1745,11 +1745,20 @@ static bool canCreateUndefOrPoison(Register Reg, const MachineRegisterInfo &MRI,
UndefPoisonKind Kind) {
MachineInstr *RegDef = MRI.getVRegDef(Reg);
+ if (auto *GMI = dyn_cast<GenericMachineInstr>(RegDef)) {
+ if (ConsiderFlagsAndMetadata && includesPoison(Kind) &&
+ GMI->hasPoisonGeneratingFlags())
+ return true;
+ } else {
+ // Conservatively return true.
+ return true;
+ }
+
switch (RegDef->getOpcode()) {
case TargetOpcode::G_FREEZE:
return false;
default:
- return true;
+ return !isa<GCastOp>(RegDef) && !isa<GBinOp>(RegDef);
}
}
@@ -1767,8 +1776,17 @@ static bool isGuaranteedNotToBeUndefOrPoison(Register Reg,
return true;
case TargetOpcode::G_IMPLICIT_DEF:
return !includesUndef(Kind);
- default:
- return false;
+ default: {
+ auto MOCheck = [&](const MachineOperand &MO) {
+ if (!MO.isReg())
+ return true;
+ return ::isGuaranteedNotToBeUndefOrPoison(MO.getReg(), MRI, Depth + 1,
+ Kind);
+ };
+ return !::canCreateUndefOrPoison(Reg, MRI,
+ /*ConsiderFlagsAndMetadata=*/true, Kind) &&
+ all_of(RegDef->uses(), MOCheck);
+ }
}
}
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index a9b59e738c00..fc4be84bca10 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -64,10 +64,10 @@ struct VectorInfo;
struct InterleavedLoadCombineImpl {
public:
InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
+ const TargetTransformInfo &TTI,
const TargetMachine &TM)
: F(F), DT(DT), MSSA(MSSA),
- TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
- TTI(TM.getTargetTransformInfo(F)) {}
+ TLI(*TM.getSubtargetImpl(F)->getTargetLowering()), TTI(TTI) {}
/// Scan the function for interleaved load candidates and execute the
/// replacement if applicable.
@@ -87,7 +87,7 @@ private:
const TargetLowering &TLI;
/// Target Transform Information
- const TargetTransformInfo TTI;
+ const TargetTransformInfo &TTI;
/// Find the instruction in sets LIs that dominates all others, return nullptr
/// if there is none.
@@ -1329,6 +1329,7 @@ struct InterleavedLoadCombine : public FunctionPass {
return InterleavedLoadCombineImpl(
F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
TPC->getTM<TargetMachine>())
.run();
}
@@ -1336,6 +1337,7 @@ struct InterleavedLoadCombine : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MemorySSAWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
@@ -1348,7 +1350,8 @@ InterleavedLoadCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
auto &MemSSA = FAM.getResult<MemorySSAAnalysis>(F).getMSSA();
- bool Changed = InterleavedLoadCombineImpl(F, DT, MemSSA, *TM).run();
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ bool Changed = InterleavedLoadCombineImpl(F, DT, MemSSA, TTI, *TM).run();
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
@@ -1360,6 +1363,7 @@ INITIALIZE_PASS_BEGIN(
false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(
InterleavedLoadCombine, DEBUG_TYPE,
"Combine interleaved loads into wide loads and shufflevector instructions",
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 643370f0573d..7b7b5459ad7b 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -414,7 +414,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
DeadRemats->insert(MI);
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
MI->substituteRegister(Dest, NewLI.reg(), 0, TRI);
- MI->getOperand(0).setIsDead(true);
+ assert(MI->registerDefIsDead(NewLI.reg(), &TRI));
} else {
if (TheDelegate)
TheDelegate->LRE_WillEraseInstruction(MI);
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 78d581c8cead..03e892a5e0d2 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1664,7 +1664,8 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
if (ShouldTrackPressure) {
// Update top scheduled pressure.
RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks,
+ /*IgnoreDead=*/false);
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
@@ -1698,7 +1699,8 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
}
if (ShouldTrackPressure) {
RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks,
+ /*IgnoreDead=*/false);
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
@@ -3775,6 +3777,21 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
}
} while (SU->isScheduled);
+ // If IsTopNode, then SU is in Top.Available and must be removed. Otherwise,
+ // if isTopReady(), then SU is in either Top.Available or Top.Pending.
+ // If !IsTopNode, then SU is in Bot.Available and must be removed. Otherwise,
+ // if isBottomReady(), then SU is in either Bot.Available or Bot.Pending.
+ //
+ // It is coincidental when !IsTopNode && isTopReady or when IsTopNode &&
+ // isBottomReady. That is, it didn't factor into the decision to choose SU
+ // because it isTopReady or isBottomReady, respectively. In fact, if the
+ // RegionPolicy is OnlyTopDown or OnlyBottomUp, then the Bot queues and Top
+ // queues respectivley contain the original roots and don't get updated when
+ // picking a node. So if SU isTopReady on a OnlyBottomUp pick, then it was
+ // because we schduled everything but the top roots. Conversley, if SU
+ // isBottomReady on OnlyTopDown, then it was because we scheduled everything
+ // but the bottom roots. If its in a queue even coincidentally, it should be
+ // removed so it does not get re-picked in a subsequent pickNode call.
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp
index 3fa22447f416..9a7eb49666b2 100644
--- a/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -873,7 +873,7 @@ void RegPressureTracker::recede(SmallVectorImpl<RegisterMaskPair> *LiveUses) {
const MachineInstr &MI = *CurrPos;
RegisterOperands RegOpers;
- RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, false);
+ RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/false);
if (TrackLaneMasks) {
SlotIndex SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
@@ -1041,7 +1041,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/true);
- assert(RegOpers.DeadDefs.size() == 0);
+ assert(RegOpers.DeadDefs.empty());
if (TrackLaneMasks)
RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
else if (RequireIntervals)
@@ -1290,7 +1290,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
RegisterOperands RegOpers;
- RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, false);
+ RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/false);
if (TrackLaneMasks)
RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp
index de8e6f63794d..8d9a5041fc2f 100644
--- a/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -331,8 +331,10 @@ void SUnit::biasCriticalPath() {
unsigned MaxDepth = BestI->getSUnit()->getDepth();
for (SUnit::pred_iterator I = std::next(BestI), E = Preds.end(); I != E;
++I) {
- if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth)
+ if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth) {
+ MaxDepth = I->getSUnit()->getDepth();
BestI = I;
+ }
}
if (BestI != Preds.begin())
std::swap(*Preds.begin(), *BestI);
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 2e03ae6aec94..0a5f0a861d48 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -130,7 +130,11 @@ public:
class SelectLike {
SelectLike(Instruction *I) : I(I) {}
+ /// The select (/or) instruction.
Instruction *I;
+ /// Whether this select is inverted, "not(cond), FalseVal, TrueVal", as
+ /// opposed to the original condition.
+ bool Inverted = false;
public:
/// Match a select or select-like instruction, returning a SelectLike.
@@ -153,14 +157,22 @@ public:
bool isValid() { return I; }
operator bool() { return isValid(); }
+ /// Invert the select by inverting the condition and switching the operands.
+ void setInverted() {
+ assert(!Inverted && "Trying to invert an inverted SelectLike");
+ assert(isa<Instruction>(getCondition()) &&
+ cast<Instruction>(getCondition())->getOpcode() ==
+ Instruction::Xor);
+ Inverted = true;
+ }
+ bool isInverted() const { return Inverted; }
+
Instruction *getI() { return I; }
const Instruction *getI() const { return I; }
Type *getType() const { return I->getType(); }
- /// Return the condition for the SelectLike instruction. For example the
- /// condition of a select or c in `or(zext(c), x)`
- Value *getCondition() const {
+ Value *getNonInvertedCondition() const {
if (auto *Sel = dyn_cast<SelectInst>(I))
return Sel->getCondition();
// Or(zext) case
@@ -177,11 +189,24 @@ public:
llvm_unreachable("Unhandled case in getCondition");
}
+ /// Return the condition for the SelectLike instruction. For example the
+ /// condition of a select or c in `or(zext(c), x)`
+ Value *getCondition() const {
+ Value *CC = getNonInvertedCondition();
+ // For inverted conditions the CC is checked when created to be a not
+ // (xor) instruction.
+ if (Inverted)
+ return cast<Instruction>(CC)->getOperand(0);
+ return CC;
+ }
+
/// Return the true value for the SelectLike instruction. Note this may not
/// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
/// the true value would be `or(x,1)`. As this value does not exist, nullptr
/// is returned.
- Value *getTrueValue() const {
+ Value *getTrueValue(bool HonorInverts = true) const {
+ if (Inverted && HonorInverts)
+ return getFalseValue(/*HonorInverts=*/false);
if (auto *Sel = dyn_cast<SelectInst>(I))
return Sel->getTrueValue();
// Or(zext) case - The true value is Or(X), so return nullptr as the value
@@ -195,7 +220,9 @@ public:
/// Return the false value for the SelectLike instruction. For example the
/// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
/// `select(c, x|1, x)`)
- Value *getFalseValue() const {
+ Value *getFalseValue(bool HonorInverts = true) const {
+ if (Inverted && HonorInverts)
+ return getTrueValue(/*HonorInverts=*/false);
if (auto *Sel = dyn_cast<SelectInst>(I))
return Sel->getFalseValue();
// Or(zext) case - return the operand which is not the zext.
@@ -216,8 +243,8 @@ public:
/// InstCostMap. This may need to be generated for select-like instructions.
Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
const TargetTransformInfo *TTI) {
- if (auto *Sel = dyn_cast<SelectInst>(I))
- if (auto *I = dyn_cast<Instruction>(Sel->getTrueValue()))
+ if (isa<SelectInst>(I))
+ if (auto *I = dyn_cast<Instruction>(getTrueValue()))
return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
: Scaled64::getZero();
@@ -242,8 +269,8 @@ public:
Scaled64
getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
const TargetTransformInfo *TTI) {
- if (auto *Sel = dyn_cast<SelectInst>(I))
- if (auto *I = dyn_cast<Instruction>(Sel->getFalseValue()))
+ if (isa<SelectInst>(I))
+ if (auto *I = dyn_cast<Instruction>(getFalseValue()))
return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
: Scaled64::getZero();
@@ -510,9 +537,10 @@ getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
for (SelectInst *DefSI = dyn_cast<SelectInst>(SI.getI());
DefSI != nullptr && Selects.count(DefSI);
DefSI = dyn_cast<SelectInst>(V)) {
- assert(DefSI->getCondition() == SI.getCondition() &&
- "The condition of DefSI does not match with SI");
- V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+ if (DefSI->getCondition() == SI.getCondition())
+ V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
+ else // Handle inverted SI
+ V = (!isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
}
if (isa<BinaryOperator>(SI.getI())) {
@@ -632,18 +660,19 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
// Delete the unconditional branch that was just created by the split.
StartBlock->getTerminator()->eraseFromParent();
- // Move any debug/pseudo instructions that were in-between the select
- // group to the newly-created end block.
- SmallVector<Instruction *, 2> DebugPseudoINS;
+ // Move any debug/pseudo instructions and not's that were in-between the
+ // select group to the newly-created end block.
+ SmallVector<Instruction *, 2> SinkInstrs;
auto DIt = SI.getI()->getIterator();
while (&*DIt != LastSI.getI()) {
if (DIt->isDebugOrPseudoInst())
- DebugPseudoINS.push_back(&*DIt);
+ SinkInstrs.push_back(&*DIt);
+ if (match(&*DIt, m_Not(m_Specific(SI.getCondition()))))
+ SinkInstrs.push_back(&*DIt);
DIt++;
}
- for (auto *DI : DebugPseudoINS) {
+ for (auto *DI : SinkInstrs)
DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
- }
// Duplicate implementation for DbgRecords, the non-instruction debug-info
// format. Helper lambda for moving DbgRecords to the end block.
@@ -765,6 +794,13 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
++BBIt;
continue;
}
+
+ // Skip not(select(..)), if the not is part of the same select group
+ if (match(NI, m_Not(m_Specific(SI.getCondition())))) {
+ ++BBIt;
+ continue;
+ }
+
// We only allow selects in the same group, not other select-like
// instructions.
if (!isa<SelectInst>(NI))
@@ -773,6 +809,10 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
SelectLike NSI = SelectLike::match(NI);
if (NSI && SI.getCondition() == NSI.getCondition()) {
SIGroup.push_back(NSI);
+ } else if (NSI && match(NSI.getCondition(),
+ m_Not(m_Specific(SI.getCondition())))) {
+ NSI.setInverted();
+ SIGroup.push_back(NSI);
} else
break;
++BBIt;
@@ -783,6 +823,12 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
if (!isSelectKindSupported(SI))
continue;
+ LLVM_DEBUG({
+ dbgs() << "New Select group with\n";
+ for (auto SI : SIGroup)
+ dbgs() << " " << *SI.getI() << "\n";
+ });
+
SIGroups.push_back(SIGroup);
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8607b5017535..93d866384b48 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10745,6 +10745,7 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
SDValue N2 = N->getOperand(2);
bool IsFSHL = N->getOpcode() == ISD::FSHL;
unsigned BitWidth = VT.getScalarSizeInBits();
+ SDLoc DL(N);
// fold (fshl N0, N1, 0) -> N0
// fold (fshr N0, N1, 0) -> N1
@@ -10764,8 +10765,8 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
// fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
if (Cst->getAPIntValue().uge(BitWidth)) {
uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
- return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
- DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
+ return DAG.getNode(N->getOpcode(), DL, VT, N0, N1,
+ DAG.getConstant(RotAmt, DL, ShAmtTy));
}
unsigned ShAmt = Cst->getZExtValue();
@@ -10777,13 +10778,13 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
// fold fshl(N0, undef_or_zero, C) -> shl(N0, C)
// fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C)
if (IsUndefOrZero(N0))
- return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1,
- DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt,
- SDLoc(N), ShAmtTy));
+ return DAG.getNode(
+ ISD::SRL, DL, VT, N1,
+ DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt, DL, ShAmtTy));
if (IsUndefOrZero(N1))
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
- DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
- SDLoc(N), ShAmtTy));
+ return DAG.getNode(
+ ISD::SHL, DL, VT, N0,
+ DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt, DL, ShAmtTy));
// fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
// fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
@@ -10832,18 +10833,19 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
if (isPowerOf2_32(BitWidth)) {
APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1);
if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
- return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2);
+ return DAG.getNode(ISD::SRL, DL, VT, N1, N2);
if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
- return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2);
+ return DAG.getNode(ISD::SHL, DL, VT, N0, N2);
}
// fold (fshl N0, N0, N2) -> (rotl N0, N2)
// fold (fshr N0, N0, N2) -> (rotr N0, N2)
- // TODO: Investigate flipping this rotate if only one is legal, if funnel shift
- // is legal as well we might be better off avoiding non-constant (BW - N2).
+ // TODO: Investigate flipping this rotate if only one is legal.
+ // If funnel shift is legal as well we might be better off avoiding
+ // non-constant (BW - N2).
unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
if (N0 == N1 && hasOperation(RotOpc, VT))
- return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
+ return DAG.getNode(RotOpc, DL, VT, N0, N2);
// Simplify, based on bits shifted out of N0/N1.
if (SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 759368a67a16..36738961382e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -1412,6 +1412,13 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
}
}
+ // Add rounding control registers as implicit def for inline asm.
+ if (MF->getFunction().hasFnAttribute(Attribute::StrictFP)) {
+ ArrayRef<MCPhysReg> RCRegs = TLI->getRoundingControlRegisters();
+ for (MCPhysReg Reg : RCRegs)
+ MIB.addReg(Reg, RegState::ImplicitDefine);
+ }
+
// GCC inline assembly allows input operands to also be early-clobber
// output operands (so long as the operand is written only after it's
// used), but this does not match the semantics of our early-clobber flag.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index c64e27fe4563..8fda35f00863 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -107,9 +107,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::SIGN_EXTEND_INREG:
Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
case ISD::SRA:
- case ISD::VP_ASHR: Res = PromoteIntRes_SRA(N); break;
+ case ISD::VP_SRA: Res = PromoteIntRes_SRA(N); break;
case ISD::SRL:
- case ISD::VP_LSHR: Res = PromoteIntRes_SRL(N); break;
+ case ISD::VP_SRL: Res = PromoteIntRes_SRL(N); break;
case ISD::VP_TRUNCATE:
case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break;
case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break;
@@ -573,7 +573,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
ShAmt);
SDValue Mask = N->getOperand(1);
SDValue EVL = N->getOperand(2);
- return DAG.getNode(ISD::VP_LSHR, dl, NVT,
+ return DAG.getNode(ISD::VP_SRL, dl, NVT,
DAG.getNode(ISD::VP_BSWAP, dl, NVT, Op, Mask, EVL), ShAmt,
Mask, EVL);
}
@@ -601,7 +601,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
DAG.getNode(ISD::BITREVERSE, dl, NVT, Op), ShAmt);
SDValue Mask = N->getOperand(1);
SDValue EVL = N->getOperand(2);
- return DAG.getNode(ISD::VP_LSHR, dl, NVT,
+ return DAG.getNode(ISD::VP_SRL, dl, NVT,
DAG.getNode(ISD::VP_BITREVERSE, dl, NVT, Op, Mask, EVL),
ShAmt, Mask, EVL);
}
@@ -1405,7 +1405,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- if (N->getOpcode() != ISD::VP_ASHR)
+ if (N->getOpcode() != ISD::VP_SRA)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
@@ -1417,7 +1417,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- if (N->getOpcode() != ISD::VP_LSHR)
+ if (N->getOpcode() != ISD::VP_SRL)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
@@ -1513,10 +1513,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VPFunnelShift(SDNode *N) {
Hi = DAG.getNode(ISD::VP_SHL, DL, VT, Hi, HiShift, Mask, EVL);
Lo = DAG.getVPZeroExtendInReg(Lo, Mask, EVL, DL, OldVT);
SDValue Res = DAG.getNode(ISD::VP_OR, DL, VT, Hi, Lo, Mask, EVL);
- Res = DAG.getNode(IsFSHR ? ISD::VP_LSHR : ISD::VP_SHL, DL, VT, Res, Amt,
+ Res = DAG.getNode(IsFSHR ? ISD::VP_SRL : ISD::VP_SHL, DL, VT, Res, Amt,
Mask, EVL);
if (!IsFSHR)
- Res = DAG.getNode(ISD::VP_LSHR, DL, VT, Res, HiShift, Mask, EVL);
+ Res = DAG.getNode(ISD::VP_SRL, DL, VT, Res, HiShift, Mask, EVL);
return Res;
}
@@ -2212,7 +2212,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VP_SIGN_EXTEND(SDNode *N) {
// FIXME: There is no VP_SIGN_EXTEND_INREG so use a pair of shifts.
SDValue Shl = DAG.getNode(ISD::VP_SHL, dl, VT, Op, ShAmt, N->getOperand(1),
N->getOperand(2));
- return DAG.getNode(ISD::VP_ASHR, dl, VT, Shl, ShAmt, N->getOperand(1),
+ return DAG.getNode(ISD::VP_SRA, dl, VT, Shl, ShAmt, N->getOperand(1),
N->getOperand(2));
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index ec0513591566..40e621f0db22 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1188,8 +1188,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::OR: case ISD::VP_OR:
case ISD::XOR: case ISD::VP_XOR:
case ISD::SHL: case ISD::VP_SHL:
- case ISD::SRA: case ISD::VP_ASHR:
- case ISD::SRL: case ISD::VP_LSHR:
+ case ISD::SRA: case ISD::VP_SRA:
+ case ISD::SRL: case ISD::VP_SRL:
case ISD::UREM: case ISD::VP_UREM:
case ISD::SREM: case ISD::VP_SREM:
case ISD::FREM: case ISD::VP_FREM:
@@ -4235,8 +4235,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SUB: case ISD::VP_SUB:
case ISD::XOR: case ISD::VP_XOR:
case ISD::SHL: case ISD::VP_SHL:
- case ISD::SRA: case ISD::VP_ASHR:
- case ISD::SRL: case ISD::VP_LSHR:
+ case ISD::SRA: case ISD::VP_SRA:
+ case ISD::SRL: case ISD::VP_SRL:
case ISD::FMINNUM: case ISD::VP_FMINNUM:
case ISD::FMAXNUM: case ISD::VP_FMAXNUM:
case ISD::FMINIMUM:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 777bbf071732..b05649c6ce95 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4780,6 +4780,13 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
(VTBits - SignBitsOp0 + 1) + (VTBits - SignBitsOp1 + 1);
return OutValidBits > VTBits ? 1 : VTBits - OutValidBits + 1;
}
+ case ISD::AVGCEILS:
+ case ISD::AVGFLOORS:
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp == 1)
+ return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ return std::min(Tmp, Tmp2);
case ISD::SREM:
// The sign bit is the LHS's sign bit, except when the result of the
// remainder is zero. The magnitude of the result should be less than or
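The new AVGCEILS/AVGFLOORS case returns the minimum of the operands' sign-bit counts, which is sound because a signed average of two values that each fit in N significant bits also fits in N significant bits. A brute-force i8 check of that bound (standalone sketch, not LLVM code; numSignBits8 is a hand-rolled stand-in for ComputeNumSignBits):

#include <algorithm>
#include <cassert>
#include <cstdint>

static unsigned numSignBits8(int8_t V) {
  unsigned Sign = (static_cast<uint8_t>(V) >> 7) & 1;
  unsigned N = 1;                       // the sign bit itself
  for (int Bit = 6; Bit >= 0; --Bit) {
    if (((static_cast<uint8_t>(V) >> Bit) & 1) != Sign)
      break;
    ++N;                                // one more copy of the sign bit
  }
  return N;
}

int main() {
  for (int A = -128; A <= 127; ++A)
    for (int B = -128; B <= 127; ++B) {
      int Floor = (A + B) >> 1;         // AVGFLOORS (arithmetic shift, no overflow here)
      int Ceil = (A + B + 1) >> 1;      // AVGCEILS
      unsigned Bound = std::min(numSignBits8(static_cast<int8_t>(A)),
                                numSignBits8(static_cast<int8_t>(B)));
      assert(numSignBits8(static_cast<int8_t>(Floor)) >= Bound);
      assert(numSignBits8(static_cast<int8_t>(Ceil)) >= Bound);
    }
  return 0;
}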
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 3ec6b9b79507..be7bcc505bd4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -587,6 +587,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
if (VT.isVector())
return false;
+ assert(Op.getOperand(0).getValueType().getScalarSizeInBits() == BitWidth &&
+ Op.getOperand(1).getValueType().getScalarSizeInBits() == BitWidth &&
+ "ShrinkDemandedOp only supports operands that have the same size!");
+
// Don't do this if the node has another user, which may require the
// full value.
if (!Op.getNode()->hasOneUse())
@@ -1832,11 +1836,33 @@ bool TargetLowering::SimplifyDemandedBits(
}
}
+ // TODO: Can we merge this fold with the one below?
// Try shrinking the operation as long as the shift amount will still be
// in range.
- if ((ShAmt < DemandedBits.getActiveBits()) &&
- ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
- return true;
+ if (ShAmt < DemandedBits.getActiveBits() && !VT.isVector() &&
+ Op.getNode()->hasOneUse()) {
+ // Search for the smallest integer type with free casts to and from
+ // Op's type. For expedience, just check power-of-2 integer types.
+ unsigned DemandedSize = DemandedBits.getActiveBits();
+ for (unsigned SmallVTBits = llvm::bit_ceil(DemandedSize);
+ SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+ EVT SmallVT = EVT::getIntegerVT(*TLO.DAG.getContext(), SmallVTBits);
+ if (isNarrowingProfitable(VT, SmallVT) &&
+ isTypeDesirableForOp(ISD::SHL, SmallVT) &&
+ isTruncateFree(VT, SmallVT) && isZExtFree(SmallVT, VT) &&
+ (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, SmallVT))) {
+ assert(DemandedSize <= SmallVTBits &&
+ "Narrowed below demanded bits?");
+ // We found a type with free casts.
+ SDValue NarrowShl = TLO.DAG.getNode(
+ ISD::SHL, dl, SmallVT,
+ TLO.DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
+ TLO.DAG.getShiftAmountConstant(ShAmt, SmallVT, dl));
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
+ }
+ }
+ }
// Narrow shift to lower half - similar to ShrinkDemandedOp.
// (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K))
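The loop added above is a specialized ShrinkDemandedOp: it searches power-of-2 widths for the smallest integer type with free trunc/zext that still covers the demanded bits, performs the SHL there, and any-extends back. A standalone sketch (C++20 for std::bit_ceil, not the DAG code) of the width choice and the bit-level equivalence that makes the rewrite safe:

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  // Pick the smallest power-of-2 width covering the demanded bits; the real
  // code keeps trying wider candidates until the target hooks accept one.
  uint64_t X = 0x123456789abcdef0ULL;
  unsigned ShAmt = 3, DemandedSize = 13;
  unsigned SmallVTBits = std::bit_ceil(DemandedSize);
  assert(SmallVTBits == 16 && DemandedSize <= SmallVTBits);

  // (shl i64 X, 3) and (any_extend (shl i16 (trunc X), 3)) agree on every
  // demanded (low) bit, which is all SimplifyDemandedBits needs.
  uint16_t NarrowShl = static_cast<uint16_t>(static_cast<uint16_t>(X) << ShAmt);
  uint64_t WideShl = X << ShAmt;
  uint64_t DemandedMask = (1ULL << DemandedSize) - 1;
  assert((NarrowShl & DemandedMask) == (WideShl & DemandedMask));
  return 0;
}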
@@ -1908,11 +1934,6 @@ bool TargetLowering::SimplifyDemandedBits(
SDValue Op1 = Op.getOperand(1);
EVT ShiftVT = Op1.getValueType();
- // Try to match AVG patterns.
- if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
- DemandedElts, Depth + 1))
- return TLO.CombineTo(Op, AVG);
-
KnownBits KnownSA = TLO.DAG.computeKnownBits(Op1, DemandedElts, Depth + 1);
if (KnownSA.isConstant() && KnownSA.getConstant().ult(BitWidth)) {
unsigned ShAmt = KnownSA.getConstant().getZExtValue();
@@ -1994,6 +2015,12 @@ bool TargetLowering::SimplifyDemandedBits(
// shift amounts.
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
}
+
+ // Try to match AVG patterns (after shift simplification).
+ if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+ DemandedElts, Depth + 1))
+ return TLO.CombineTo(Op, AVG);
+
break;
}
case ISD::SRA: {
@@ -2015,11 +2042,6 @@ bool TargetLowering::SimplifyDemandedBits(
if (DemandedBits.isOne())
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
- // Try to match AVG patterns.
- if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
- DemandedElts, Depth + 1))
- return TLO.CombineTo(Op, AVG);
-
KnownBits KnownSA = TLO.DAG.computeKnownBits(Op1, DemandedElts, Depth + 1);
if (KnownSA.isConstant() && KnownSA.getConstant().ult(BitWidth)) {
unsigned ShAmt = KnownSA.getConstant().getZExtValue();
@@ -2106,6 +2128,12 @@ bool TargetLowering::SimplifyDemandedBits(
}
}
}
+
+ // Try to match AVG patterns (after shift simplification).
+ if (SDValue AVG = combineShiftToAVG(Op, TLO.DAG, *this, DemandedBits,
+ DemandedElts, Depth + 1))
+ return TLO.CombineTo(Op, AVG);
+
break;
}
case ISD::FSHL:
@@ -2786,10 +2814,16 @@ bool TargetLowering::SimplifyDemandedBits(
unsigned DemandedBitsLZ = DemandedBits.countl_zero();
APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
KnownBits KnownOp0, KnownOp1;
- if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, KnownOp0, TLO,
- Depth + 1) ||
- SimplifyDemandedBits(Op1, LoMask, DemandedElts, KnownOp1, TLO,
+ auto GetDemandedBitsLHSMask = [&](APInt Demanded,
+ const KnownBits &KnownRHS) {
+ if (Op.getOpcode() == ISD::MUL)
+ Demanded.clearHighBits(KnownRHS.countMinTrailingZeros());
+ return Demanded;
+ };
+ if (SimplifyDemandedBits(Op1, LoMask, DemandedElts, KnownOp1, TLO,
Depth + 1) ||
+ SimplifyDemandedBits(Op0, GetDemandedBitsLHSMask(LoMask, KnownOp1),
+ DemandedElts, KnownOp0, TLO, Depth + 1) ||
// See if the operation should be performed at a smaller bit width.
ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
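Simplifying the RHS first pays off for ISD::MUL: if the RHS is known to have K trailing zero bits, the demanded low bits of the product cannot depend on the top K bits of the LHS, which is exactly what GetDemandedBitsLHSMask's clearHighBits call encodes. A brute-force i8 check of that property (standalone sketch, not LLVM code):

#include <cassert>
#include <cstdint>

int main() {
  // If RHS has K trailing zeros, the product's low 8 bits do not depend on
  // the top K bits of LHS, so those bits need not be demanded from the LHS.
  for (unsigned K = 0; K <= 7; ++K) {
    uint8_t Rhs = static_cast<uint8_t>(0xA5u << K);  // at least K trailing zeros
    for (unsigned Lhs = 0; Lhs < 256; ++Lhs) {
      uint8_t LowOnly = static_cast<uint8_t>(Lhs << K) >> K;  // top K bits cleared
      uint8_t Full = static_cast<uint8_t>(Lhs * Rhs);
      uint8_t Masked = static_cast<uint8_t>(LowOnly * Rhs);
      assert(Full == Masked);
    }
  }
  return 0;
}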
@@ -7855,7 +7889,7 @@ static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) {
InvShAmt = DAG.getNode(ISD::VP_SUB, DL, ShVT, BitWidthC, ShAmt, Mask, VL);
ShX = DAG.getNode(ISD::VP_SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt, Mask,
VL);
- ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt, Mask,
+ ShY = DAG.getNode(ISD::VP_SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt, Mask,
VL);
} else {
// fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
@@ -7877,12 +7911,12 @@ static SDValue expandVPFunnelShift(SDNode *Node, SelectionDAG &DAG) {
SDValue One = DAG.getConstant(1, DL, ShVT);
if (IsFSHL) {
ShX = DAG.getNode(ISD::VP_SHL, DL, VT, X, ShAmt, Mask, VL);
- SDValue ShY1 = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, One, Mask, VL);
- ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, ShY1, InvShAmt, Mask, VL);
+ SDValue ShY1 = DAG.getNode(ISD::VP_SRL, DL, VT, Y, One, Mask, VL);
+ ShY = DAG.getNode(ISD::VP_SRL, DL, VT, ShY1, InvShAmt, Mask, VL);
} else {
SDValue ShX1 = DAG.getNode(ISD::VP_SHL, DL, VT, X, One, Mask, VL);
ShX = DAG.getNode(ISD::VP_SHL, DL, VT, ShX1, InvShAmt, Mask, VL);
- ShY = DAG.getNode(ISD::VP_LSHR, DL, VT, Y, ShAmt, Mask, VL);
+ ShY = DAG.getNode(ISD::VP_SRL, DL, VT, Y, ShAmt, Mask, VL);
}
}
return DAG.getNode(ISD::VP_OR, DL, VT, ShX, ShY, Mask, VL);
@@ -8849,7 +8883,7 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
// v = v - ((v >> 1) & 0x55555555...)
Tmp1 = DAG.getNode(ISD::VP_AND, dl, VT,
- DAG.getNode(ISD::VP_LSHR, dl, VT, Op,
+ DAG.getNode(ISD::VP_SRL, dl, VT, Op,
DAG.getConstant(1, dl, ShVT), Mask, VL),
Mask55, Mask, VL);
Op = DAG.getNode(ISD::VP_SUB, dl, VT, Op, Tmp1, Mask, VL);
@@ -8857,13 +8891,13 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
// v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Op, Mask33, Mask, VL);
Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT,
- DAG.getNode(ISD::VP_LSHR, dl, VT, Op,
+ DAG.getNode(ISD::VP_SRL, dl, VT, Op,
DAG.getConstant(2, dl, ShVT), Mask, VL),
Mask33, Mask, VL);
Op = DAG.getNode(ISD::VP_ADD, dl, VT, Tmp2, Tmp3, Mask, VL);
// v = (v + (v >> 4)) & 0x0F0F0F0F...
- Tmp4 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(4, dl, ShVT),
+ Tmp4 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(4, dl, ShVT),
Mask, VL),
Tmp5 = DAG.getNode(ISD::VP_ADD, dl, VT, Op, Tmp4, Mask, VL);
Op = DAG.getNode(ISD::VP_AND, dl, VT, Tmp5, Mask0F, Mask, VL);
@@ -8887,8 +8921,8 @@ SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {
Mask, VL);
}
}
- return DAG.getNode(ISD::VP_LSHR, dl, VT, V,
- DAG.getConstant(Len - 8, dl, ShVT), Mask, VL);
+ return DAG.getNode(ISD::VP_SRL, dl, VT, V, DAG.getConstant(Len - 8, dl, ShVT),
+ Mask, VL);
}
SDValue TargetLowering::expandCTLZ(SDNode *Node, SelectionDAG &DAG) const {
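expandVPCTPOP is the classic SWAR population count: pairwise sums under the 0x55... mask, nibble sums under 0x33..., byte sums under 0x0F..., then (when a multiply is legal) one multiply by 0x0101... accumulates every byte sum into the top byte, which the final shift by Len - 8 extracts. A scalar 64-bit sketch of the same sequence (standalone, not the DAG code):

#include <cassert>
#include <cstdint>

static unsigned popcount64(uint64_t V) {
  // v = v - ((v >> 1) & 0x5555...): each 2-bit field now holds its own popcount.
  V = V - ((V >> 1) & 0x5555555555555555ULL);
  // Sum adjacent 2-bit fields into 4-bit fields.
  V = (V & 0x3333333333333333ULL) + ((V >> 2) & 0x3333333333333333ULL);
  // Sum adjacent 4-bit fields into bytes.
  V = (V + (V >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
  // The multiply gathers all byte sums into the most significant byte...
  V *= 0x0101010101010101ULL;
  // ...and the final shift by Len - 8 = 56 extracts it.
  return static_cast<unsigned>(V >> 56);
}

int main() {
  assert(popcount64(0) == 0);
  assert(popcount64(0xFFFFFFFFFFFFFFFFULL) == 64);
  assert(popcount64(0x8000000000000001ULL) == 2);
  assert(popcount64(0x00FF00FF00FF00FFULL) == 32);
  return 0;
}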
@@ -8960,7 +8994,7 @@ SDValue TargetLowering::expandVPCTLZ(SDNode *Node, SelectionDAG &DAG) const {
for (unsigned i = 0; (1U << i) < NumBitsPerElt; ++i) {
SDValue Tmp = DAG.getConstant(1ULL << i, dl, ShVT);
Op = DAG.getNode(ISD::VP_OR, dl, VT, Op,
- DAG.getNode(ISD::VP_LSHR, dl, VT, Op, Tmp, Mask, VL), Mask,
+ DAG.getNode(ISD::VP_SRL, dl, VT, Op, Tmp, Mask, VL), Mask,
VL);
}
Op = DAG.getNode(ISD::VP_XOR, dl, VT, Op, DAG.getConstant(-1, dl, VT), Mask,
@@ -9194,11 +9228,21 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const {
DAG.getNode(ISD::USUBSAT, dl, VT, LHS, RHS),
DAG.getNode(ISD::USUBSAT, dl, VT, RHS, LHS));
- // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
- // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
+
+ // Branchless expansion iff cmp result is allbits:
+ // abds(lhs, rhs) -> sub(sgt(lhs, rhs), xor(sgt(lhs, rhs), sub(lhs, rhs)))
+ // abdu(lhs, rhs) -> sub(ugt(lhs, rhs), xor(ugt(lhs, rhs), sub(lhs, rhs)))
+ if (CCVT == VT && getBooleanContents(VT) == ZeroOrNegativeOneBooleanContent) {
+ SDValue Diff = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
+ SDValue Xor = DAG.getNode(ISD::XOR, dl, VT, Diff, Cmp);
+ return DAG.getNode(ISD::SUB, dl, VT, Cmp, Xor);
+ }
+
+ // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
+ // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
}
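The branchless ABD expansion added above is a conditional negate: with Cmp equal to all-ones when the predicate holds and zero otherwise, sub(Cmp, xor(Cmp, sub(lhs, rhs))) yields lhs - rhs under an all-ones Cmp and rhs - lhs under a zero Cmp, matching the select form it replaces. A brute-force i8 equivalence check (standalone sketch, signed case only):

#include <cassert>
#include <cstdint>

int main() {
  for (int L = -128; L <= 127; ++L)
    for (int R = -128; R <= 127; ++R) {
      uint8_t Lhs = static_cast<uint8_t>(L), Rhs = static_cast<uint8_t>(R);
      // Reference expansion: select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs)).
      uint8_t Sel = (L > R) ? static_cast<uint8_t>(Lhs - Rhs)
                            : static_cast<uint8_t>(Rhs - Lhs);
      // Branchless form: Cmp is all-ones exactly when the predicate holds.
      uint8_t Cmp = (L > R) ? 0xFF : 0x00;
      uint8_t Diff = static_cast<uint8_t>(Lhs - Rhs);
      uint8_t Abd = static_cast<uint8_t>(Cmp - (Diff ^ Cmp));
      assert(Abd == Sel);
    }
  return 0;
}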
@@ -9279,7 +9323,7 @@ SDValue TargetLowering::expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const {
case MVT::i16:
Tmp1 = DAG.getNode(ISD::VP_SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
return DAG.getNode(ISD::VP_OR, dl, VT, Tmp1, Tmp2, Mask, EVL);
case MVT::i32:
@@ -9289,11 +9333,11 @@ SDValue TargetLowering::expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const {
Mask, EVL);
Tmp3 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp3, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(0xFF00, dl, VT), Mask, EVL);
- Tmp1 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+ Tmp1 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
Mask, EVL);
Tmp4 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp3, Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp1, Mask, EVL);
@@ -9313,19 +9357,19 @@ SDValue TargetLowering::expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const {
DAG.getConstant(255ULL << 24, dl, VT), Mask, EVL);
Tmp5 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp5, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
- Tmp4 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+ Tmp4 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
Mask, EVL);
Tmp4 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp4,
DAG.getConstant(255ULL << 24, dl, VT), Mask, EVL);
- Tmp3 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+ Tmp3 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
Mask, EVL);
Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp3,
DAG.getConstant(255ULL << 16, dl, VT), Mask, EVL);
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(40, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(255ULL << 8, dl, VT), Mask, EVL);
- Tmp1 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(56, dl, SHVT),
+ Tmp1 = DAG.getNode(ISD::VP_SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT),
Mask, EVL);
Tmp8 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp8, Tmp7, Mask, EVL);
Tmp6 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp6, Tmp5, Mask, EVL);
@@ -9424,7 +9468,7 @@ SDValue TargetLowering::expandVPBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
Tmp = (Sz > 8 ? DAG.getNode(ISD::VP_BSWAP, dl, VT, Op, Mask, EVL) : Op);
// swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Tmp, DAG.getConstant(4, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(Mask4, dl, VT), Mask, EVL);
@@ -9435,7 +9479,7 @@ SDValue TargetLowering::expandVPBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
Tmp = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp3, Mask, EVL);
// swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Tmp, DAG.getConstant(2, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(Mask2, dl, VT), Mask, EVL);
@@ -9446,7 +9490,7 @@ SDValue TargetLowering::expandVPBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
Tmp = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp3, Mask, EVL);
// swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
- Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT),
+ Tmp2 = DAG.getNode(ISD::VP_SRL, dl, VT, Tmp, DAG.getConstant(1, dl, SHVT),
Mask, EVL);
Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
DAG.getConstant(Mask1, dl, VT), Mask, EVL);
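expandVPBITREVERSE byte-swaps first (for widths above 8) and then reverses within each byte by swapping 4-, 2-, and 1-bit groups under the 0x0F, 0x33, and 0x55 masks, exactly the steps above. A scalar 8-bit sketch of the in-byte stage (standalone, not the DAG code):

#include <cassert>
#include <cstdint>

static uint8_t bitreverse8(uint8_t V) {
  // swap i4: ((V >> 4) & 0x0F) | ((V & 0x0F) << 4)
  V = static_cast<uint8_t>(((V >> 4) & 0x0F) | ((V & 0x0F) << 4));
  // swap i2: ((V >> 2) & 0x33) | ((V & 0x33) << 2)
  V = static_cast<uint8_t>(((V >> 2) & 0x33) | ((V & 0x33) << 2));
  // swap i1: ((V >> 1) & 0x55) | ((V & 0x55) << 1)
  V = static_cast<uint8_t>(((V >> 1) & 0x55) | ((V & 0x55) << 1));
  return V;
}

int main() {
  assert(bitreverse8(0x01) == 0x80);
  assert(bitreverse8(0x80) == 0x01);
  assert(bitreverse8(0xA5) == 0xA5);   // 1010'0101 is its own reverse
  assert(bitreverse8(0xF0) == 0x0F);
  return 0;
}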
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 3e1897ce670a..0fc915d89f6c 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -523,6 +523,8 @@ static unsigned getELFSectionType(StringRef Name, SectionKind K) {
if (hasPrefix(Name, ".llvm.offloading"))
return ELF::SHT_LLVM_OFFLOADING;
+ if (Name == ".llvm.lto")
+ return ELF::SHT_LLVM_LTO;
if (K.isBSS() || K.isThreadBSS())
return ELF::SHT_NOBITS;
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 58db686ec7d5..3d5c58d282da 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -579,9 +579,11 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
// clang-format on
}
-/// Return the value type corresponding to the specified type. This returns all
-/// pointers as MVT::iPTR. If HandleUnknown is true, unknown types are returned
-/// as Other, otherwise they are invalid.
+/// Return the value type corresponding to the specified type.
+/// If HandleUnknown is true, unknown types are returned as Other, otherwise
+/// they are invalid.
+/// NB: This includes pointer types, which require a DataLayout to convert
+/// to a concrete value type.
MVT MVT::getVT(Type *Ty, bool HandleUnknown){
assert(Ty != nullptr && "Invalid type");
switch (Ty->getTypeID()) {
@@ -611,7 +613,6 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
case Type::X86_AMXTyID: return MVT(MVT::x86amx);
case Type::FP128TyID: return MVT(MVT::f128);
case Type::PPC_FP128TyID: return MVT(MVT::ppcf128);
- case Type::PointerTyID: return MVT(MVT::iPTR);
case Type::FixedVectorTyID:
case Type::ScalableVectorTyID: {
VectorType *VTy = cast<VectorType>(Ty);
@@ -622,9 +623,11 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
}
}
-/// getEVT - Return the value type corresponding to the specified type. This
-/// returns all pointers as MVT::iPTR. If HandleUnknown is true, unknown types
-/// are returned as Other, otherwise they are invalid.
+/// getEVT - Return the value type corresponding to the specified type.
+/// If HandleUnknown is true, unknown types are returned as Other, otherwise
+/// they are invalid.
+/// NB: This includes pointer types, which require a DataLayout to convert
+/// to a concrete value type.
EVT EVT::getEVT(Type *Ty, bool HandleUnknown){
switch (Ty->getTypeID()) {
default:
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
index 4da031716e32..3cdffb8cd061 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
@@ -75,7 +75,7 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
auto BailOut = [&](Error Err) {
std::pair<void *, Allocation> AllocToDestroy;
- // Get allocation to destory.
+ // Get allocation to destroy.
{
std::lock_guard<std::mutex> Lock(M);
auto I = Allocations.find(Base.toPtr<void *>());
@@ -153,7 +153,7 @@ Error SimpleExecutorMemoryManager::deallocate(
std::vector<std::pair<void *, Allocation>> AllocPairs;
AllocPairs.reserve(Bases.size());
- // Get allocation to destory.
+ // Get allocation to destroy.
Error Err = Error::success();
{
std::lock_guard<std::mutex> Lock(M);
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 59e7a9f5eb11..c3bde48b982c 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -930,6 +930,8 @@ ConstantRange ConstantRange::overflowingBinaryOp(Instruction::BinaryOps BinOp,
return addWithNoWrap(Other, NoWrapKind);
case Instruction::Sub:
return subWithNoWrap(Other, NoWrapKind);
+ case Instruction::Mul:
+ return multiplyWithNoWrap(Other, NoWrapKind);
default:
// Don't know about this Overflowing Binary Operation.
// Conservatively fallback to plain binop handling.
@@ -1167,6 +1169,26 @@ ConstantRange::multiply(const ConstantRange &Other) const {
return UR.isSizeStrictlySmallerThan(SR) ? UR : SR;
}
+ConstantRange
+ConstantRange::multiplyWithNoWrap(const ConstantRange &Other,
+ unsigned NoWrapKind,
+ PreferredRangeType RangeType) const {
+ if (isEmptySet() || Other.isEmptySet())
+ return getEmpty();
+ if (isFullSet() && Other.isFullSet())
+ return getFull();
+
+ ConstantRange Result = multiply(Other);
+
+ if (NoWrapKind & OverflowingBinaryOperator::NoSignedWrap)
+ Result = Result.intersectWith(smul_sat(Other), RangeType);
+
+ if (NoWrapKind & OverflowingBinaryOperator::NoUnsignedWrap)
+ Result = Result.intersectWith(umul_sat(Other), RangeType);
+
+ return Result;
+}
+
ConstantRange ConstantRange::smul_fast(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return getEmpty();
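multiplyWithNoWrap tightens the plain multiply range by intersecting it with the saturating multiply range: a product that does not wrap is unchanged by saturation, so it always lies inside the saturating range, while products only reachable by wrapping may fall outside it and be pruned. A brute-force containment check for one unsigned i8 range pair (standalone sketch over concrete values rather than ConstantRange):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // Unsigned i8 ranges [ALo, AHi] x [BLo, BHi]; every nuw product must land in
  // the umul_sat range, so intersecting with it (as multiplyWithNoWrap does)
  // remains a sound bound while discarding wrap-only results.
  const unsigned ALo = 20, AHi = 30, BLo = 7, BHi = 12;
  unsigned SatLo = 255, SatHi = 0;
  for (unsigned A = ALo; A <= AHi; ++A)
    for (unsigned B = BLo; B <= BHi; ++B) {
      unsigned Sat = std::min(A * B, 255u);   // umul_sat result
      SatLo = std::min(SatLo, Sat);
      SatHi = std::max(SatHi, Sat);
    }
  for (unsigned A = ALo; A <= AHi; ++A)
    for (unsigned B = BLo; B <= BHi; ++B) {
      if (A * B > 255)
        continue;                             // wraps: excluded under nuw
      assert(A * B >= SatLo && A * B <= SatHi);
    }
  return 0;
}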
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 0d6760ed0841..b32799355d69 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -43,8 +43,8 @@ using namespace llvm;
GlobalVariable *IRBuilderBase::CreateGlobalString(StringRef Str,
const Twine &Name,
unsigned AddressSpace,
- Module *M) {
- Constant *StrConstant = ConstantDataArray::getString(Context, Str);
+ Module *M, bool AddNull) {
+ Constant *StrConstant = ConstantDataArray::getString(Context, Str, AddNull);
if (!M)
M = BB->getParent()->getParent();
auto *GV = new GlobalVariable(
diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp
index 0bf41d7cc7c2..bd68db3a6f96 100644
--- a/llvm/lib/IR/MDBuilder.cpp
+++ b/llvm/lib/IR/MDBuilder.cpp
@@ -86,9 +86,8 @@ MDNode *MDBuilder::createFunctionEntryCount(
}
MDNode *MDBuilder::createFunctionSectionPrefix(StringRef Prefix) {
- return MDNode::get(Context,
- {createString("function_section_prefix"),
- createString(Prefix)});
+ return MDNode::get(
+ Context, {createString("function_section_prefix"), createString(Prefix)});
}
MDNode *MDBuilder::createRange(const APInt &Lo, const APInt &Hi) {
@@ -148,9 +147,10 @@ MDNode *MDBuilder::mergeCallbackEncodings(MDNode *ExistingCallbacks,
for (unsigned u = 0; u < NumExistingOps; u++) {
Ops[u] = ExistingCallbacks->getOperand(u);
- auto *OldCBCalleeIdxAsCM = cast<ConstantAsMetadata>(Ops[u]);
+ auto *OldCBCalleeIdxAsCM =
+ cast<ConstantAsMetadata>(cast<MDNode>(Ops[u])->getOperand(0));
uint64_t OldCBCalleeIdx =
- cast<ConstantInt>(OldCBCalleeIdxAsCM->getValue())->getZExtValue();
+ cast<ConstantInt>(OldCBCalleeIdxAsCM->getValue())->getZExtValue();
(void)OldCBCalleeIdx;
assert(NewCBCalleeIdx != OldCBCalleeIdx &&
"Cannot map a callback callee index twice!");
@@ -339,8 +339,8 @@ MDNode *MDBuilder::createMutableTBAAAccessTag(MDNode *Tag) {
MDNode *MDBuilder::createIrrLoopHeaderWeight(uint64_t Weight) {
Metadata *Vals[] = {
- createString("loop_header_weight"),
- createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight)),
+ createString("loop_header_weight"),
+ createConstant(ConstantInt::get(Type::getInt64Ty(Context), Weight)),
};
return MDNode::get(Context, Vals);
}
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index a8696ed9e3ce..f97dd18c736c 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -882,7 +882,7 @@ StringRef Module::getDarwinTargetVariantTriple() const {
}
void Module::setDarwinTargetVariantTriple(StringRef T) {
- addModuleFlag(ModFlagBehavior::Override, "darwin.target_variant.triple",
+ addModuleFlag(ModFlagBehavior::Warning, "darwin.target_variant.triple",
MDString::get(getContext(), T));
}
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index e2754d74979e..7304eab738ce 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1400,18 +1400,20 @@ public:
llvm::StringRef ModulePath,
const std::string &NewModulePath) {
std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+ GVSummaryPtrSet DeclarationSummaries;
std::error_code EC;
gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
- ImportList, ModuleToSummariesForIndex);
+ ImportList, ModuleToSummariesForIndex,
+ DeclarationSummaries);
raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
sys::fs::OpenFlags::OF_None);
if (EC)
return errorCodeToError(EC);
- // TODO: Serialize declaration bits to bitcode.
- writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex);
+ writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex,
+ &DeclarationSummaries);
if (ShouldEmitImportsFiles) {
EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports",
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index 8f517eb50dc7..b054b42b6377 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -766,7 +766,7 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
void ThinLTOCodeGenerator::gatherImportedSummariesForModule(
Module &TheModule, ModuleSummaryIndex &Index,
std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
- const lto::InputFile &File) {
+ GVSummaryPtrSet &DecSummaries, const lto::InputFile &File) {
auto ModuleCount = Index.modulePaths().size();
auto ModuleIdentifier = TheModule.getModuleIdentifier();
@@ -796,7 +796,7 @@ void ThinLTOCodeGenerator::gatherImportedSummariesForModule(
llvm::gatherImportedSummariesForModule(
ModuleIdentifier, ModuleToDefinedGVSummaries,
- ImportLists[ModuleIdentifier], ModuleToSummariesForIndex);
+ ImportLists[ModuleIdentifier], ModuleToSummariesForIndex, DecSummaries);
}
/**
@@ -832,10 +832,14 @@ void ThinLTOCodeGenerator::emitImports(Module &TheModule, StringRef OutputName,
IsPrevailing(PrevailingCopy), ImportLists,
ExportLists);
+  // 'EmitImportsFiles' emits the list of modules to import from, and the set of
+  // keys in `ModuleToSummariesForIndex` should be a superset of the keys in
+  // `DecSummaries`, so there is no need to use `DecSummaries` in `EmitImportsFiles`.
+ GVSummaryPtrSet DecSummaries;
std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
llvm::gatherImportedSummariesForModule(
ModuleIdentifier, ModuleToDefinedGVSummaries,
- ImportLists[ModuleIdentifier], ModuleToSummariesForIndex);
+ ImportLists[ModuleIdentifier], ModuleToSummariesForIndex, DecSummaries);
std::error_code EC;
if ((EC = EmitImportsFiles(ModuleIdentifier, OutputName,
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index bcf065c56691..d5cbdc5de0b8 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -31,9 +31,9 @@ InstrBuilder::InstrBuilder(const llvm::MCSubtargetInfo &sti,
const llvm::MCInstrInfo &mcii,
const llvm::MCRegisterInfo &mri,
const llvm::MCInstrAnalysis *mcia,
- const mca::InstrumentManager &im)
+ const mca::InstrumentManager &im, unsigned cl)
: STI(sti), MCII(mcii), MRI(mri), MCIA(mcia), IM(im), FirstCallInst(true),
- FirstReturnInst(true) {
+ FirstReturnInst(true), CallLatency(cl) {
const MCSchedModel &SM = STI.getSchedModel();
ProcResourceMasks.resize(SM.getNumProcResourceKinds());
computeProcResourceMasks(STI.getSchedModel(), ProcResourceMasks);
@@ -220,17 +220,19 @@ static void initializeUsedResources(InstrDesc &ID,
static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
const MCSchedClassDesc &SCDesc,
- const MCSubtargetInfo &STI) {
+ const MCSubtargetInfo &STI,
+ unsigned CallLatency) {
if (MCDesc.isCall()) {
// We cannot estimate how long this call will take.
- // Artificially set an arbitrarily high latency (100cy).
- ID.MaxLatency = 100U;
+ // Artificially set an arbitrarily high latency.
+ ID.MaxLatency = CallLatency;
return;
}
int Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
- // If latency is unknown, then conservatively assume a MaxLatency of 100cy.
- ID.MaxLatency = Latency < 0 ? 100U : static_cast<unsigned>(Latency);
+ // If latency is unknown, then conservatively assume the MaxLatency set for
+ // calls.
+ ID.MaxLatency = Latency < 0 ? CallLatency : static_cast<unsigned>(Latency);
}
static Error verifyOperands(const MCInstrDesc &MCDesc, const MCInst &MCI) {
@@ -568,7 +570,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
// We don't correctly model calls.
WithColor::warning() << "found a call in the input assembly sequence.\n";
WithColor::note() << "call instructions are not correctly modeled. "
- << "Assume a latency of 100cy.\n";
+ << "Assume a latency of " << CallLatency << "cy.\n";
FirstCallInst = false;
}
@@ -580,7 +582,7 @@ InstrBuilder::createInstrDescImpl(const MCInst &MCI,
}
initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
- computeMaxLatency(*ID, MCDesc, SCDesc, STI);
+ computeMaxLatency(*ID, MCDesc, SCDesc, STI, CallLatency);
if (Error Err = verifyOperands(MCDesc, MCI))
return std::move(Err);
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 806d01de1ada..f9cd71b37002 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -1002,46 +1002,60 @@ void InstrProfRecord::addValueData(uint32_t ValueKind, uint32_t Site,
ValueSites.emplace_back(VData, VData + N);
}
-std::vector<BPFunctionNode> TemporalProfTraceTy::createBPFunctionNodes(
- ArrayRef<TemporalProfTraceTy> Traces) {
+void TemporalProfTraceTy::createBPFunctionNodes(
+ ArrayRef<TemporalProfTraceTy> Traces, std::vector<BPFunctionNode> &Nodes,
+ bool RemoveOutlierUNs) {
using IDT = BPFunctionNode::IDT;
using UtilityNodeT = BPFunctionNode::UtilityNodeT;
- // Collect all function IDs ordered by their smallest timestamp. This will be
- // used as the initial FunctionNode order.
- SetVector<IDT> FunctionIds;
- size_t LargestTraceSize = 0;
- for (auto &Trace : Traces)
- LargestTraceSize =
- std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
- for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
- for (auto &Trace : Traces)
- if (Timestamp < Trace.FunctionNameRefs.size())
- FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
-
- const int N = Log2_64(LargestTraceSize) + 1;
-
+ UtilityNodeT MaxUN = 0;
+ DenseMap<IDT, size_t> IdToFirstTimestamp;
+ DenseMap<IDT, UtilityNodeT> IdToFirstUN;
+ DenseMap<IDT, SmallVector<UtilityNodeT>> IdToUNs;
// TODO: We need to use the Trace.Weight field to give more weight to more
// important utilities
- DenseMap<IDT, SmallVector<UtilityNodeT, 4>> FuncGroups;
- for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
- auto &Trace = Traces[TraceIdx].FunctionNameRefs;
- for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
- for (int I = Log2_64(Timestamp + 1); I < N; I++) {
- auto FunctionId = Trace[Timestamp];
- UtilityNodeT GroupId = TraceIdx * N + I;
- FuncGroups[FunctionId].push_back(GroupId);
+ for (auto &Trace : Traces) {
+ size_t CutoffTimestamp = 1;
+ for (size_t Timestamp = 0; Timestamp < Trace.FunctionNameRefs.size();
+ Timestamp++) {
+ IDT Id = Trace.FunctionNameRefs[Timestamp];
+ auto [It, WasInserted] = IdToFirstTimestamp.try_emplace(Id, Timestamp);
+ if (!WasInserted)
+ It->getSecond() = std::min<size_t>(It->getSecond(), Timestamp);
+ if (Timestamp >= CutoffTimestamp) {
+ ++MaxUN;
+ CutoffTimestamp = 2 * Timestamp;
}
+ IdToFirstUN.try_emplace(Id, MaxUN);
}
+ for (auto &[Id, FirstUN] : IdToFirstUN)
+ for (auto UN = FirstUN; UN <= MaxUN; ++UN)
+ IdToUNs[Id].push_back(UN);
+ ++MaxUN;
+ IdToFirstUN.clear();
}
- std::vector<BPFunctionNode> Nodes;
- for (auto Id : FunctionIds) {
- auto &UNs = FuncGroups[Id];
- llvm::sort(UNs);
- UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
- Nodes.emplace_back(Id, UNs);
+ if (RemoveOutlierUNs) {
+ DenseMap<UtilityNodeT, unsigned> UNFrequency;
+ for (auto &[Id, UNs] : IdToUNs)
+ for (auto &UN : UNs)
+ ++UNFrequency[UN];
+    // Filter out utility nodes that are too infrequent or too prevalent, so
+    // that BalancedPartitioning is more effective.
+ for (auto &[Id, UNs] : IdToUNs)
+ llvm::erase_if(UNs, [&](auto &UN) {
+ return UNFrequency[UN] <= 1 || 2 * UNFrequency[UN] > IdToUNs.size();
+ });
}
- return Nodes;
+
+ for (auto &[Id, UNs] : IdToUNs)
+ Nodes.emplace_back(Id, UNs);
+
+ // Since BalancedPartitioning is sensitive to the initial order, we explicitly
+ // order nodes by their earliest timestamp.
+ llvm::sort(Nodes, [&](auto &L, auto &R) {
+ return std::make_pair(IdToFirstTimestamp[L.Id], L.Id) <
+ std::make_pair(IdToFirstTimestamp[R.Id], R.Id);
+ });
}
#define INSTR_PROF_COMMON_API_IMPL
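The rewritten createBPFunctionNodes walks each trace once, opening a fresh utility node whenever the timestamp reaches the current cutoff and then doubling the cutoff, so a trace of length L contributes only about log2(L) utility nodes and each function later collects every node from its first appearance onward. A standalone sketch of just the cutoff-doubling walk (hypothetical trace data, not the profile code):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // One trace of function IDs, in execution order.
  std::vector<unsigned> Trace = {7, 3, 7, 9, 4, 3, 8, 2, 7, 6, 1, 5};
  unsigned UN = 0;             // current utility node
  std::size_t CutoffTimestamp = 1;
  for (std::size_t Timestamp = 0; Timestamp < Trace.size(); ++Timestamp) {
    if (Timestamp >= CutoffTimestamp) {
      ++UN;                              // open a new utility node
      CutoffTimestamp = 2 * Timestamp;   // and double the cutoff
    }
    std::printf("t=%zu func=%u first UN=%u\n", Timestamp, Trace[Timestamp], UN);
  }
  // Timestamps 0, 1, 2, 4, 8, ... start new utility nodes, so a trace of
  // length L produces roughly log2(L) of them.
  return 0;
}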
@@ -1620,13 +1634,12 @@ inline size_t constexpr offsetOf(T1 T2::*Member) {
return size_t(&(Object.*Member)) - size_t(&Object);
}
+// Read a uint64_t from the specified buffer offset, and swap the bytes into
+// the native endianness if necessary.
static inline uint64_t read(const unsigned char *Buffer, size_t Offset) {
- return *reinterpret_cast<const uint64_t *>(Buffer + Offset);
-}
-
-uint64_t Header::formatVersion() const {
- using namespace support;
- return endian::byte_swap<uint64_t, llvm::endianness::little>(Version);
+ using namespace ::support;
+ return endian::read<uint64_t, llvm::endianness::little, unaligned>(Buffer +
+ Offset);
}
Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
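The replacement read helper makes the header load explicitly little-endian and alignment-safe instead of reinterpret_casting the buffer, so the same bytes parse identically on big-endian hosts. A standalone sketch of the same idea without the llvm::support helpers (assumes C++20 std::endian and the GCC/Clang __builtin_bswap64 builtin):

#include <bit>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

// Read a uint64_t stored little-endian at Buffer + Offset, regardless of the
// host byte order or the buffer's alignment.
static uint64_t readLE64(const unsigned char *Buffer, std::size_t Offset) {
  uint64_t V;
  std::memcpy(&V, Buffer + Offset, sizeof(V));   // unaligned-safe load
  if constexpr (std::endian::native == std::endian::big)
    V = __builtin_bswap64(V);                    // swap into native order
  return V;
}

int main() {
  const unsigned char Buf[] = {0xEF, 0xBE, 0xAD, 0xDE, 0x78, 0x56, 0x34, 0x12};
  assert(readLE64(Buf, 0) == 0x12345678DEADBEEFULL);
  return 0;
}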
@@ -1638,18 +1651,15 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
H.Magic = read(Buffer, offsetOf(&Header::Magic));
// Check the magic number.
- uint64_t Magic =
- endian::byte_swap<uint64_t, llvm::endianness::little>(H.Magic);
- if (Magic != IndexedInstrProf::Magic)
+ if (H.Magic != IndexedInstrProf::Magic)
return make_error<InstrProfError>(instrprof_error::bad_magic);
// Read the version.
H.Version = read(Buffer, offsetOf(&Header::Version));
- if (GET_VERSION(H.formatVersion()) >
- IndexedInstrProf::ProfVersion::CurrentVersion)
+ if (GET_VERSION(H.Version) > IndexedInstrProf::ProfVersion::CurrentVersion)
return make_error<InstrProfError>(instrprof_error::unsupported_version);
- switch (GET_VERSION(H.formatVersion())) {
+ switch (GET_VERSION(H.Version)) {
// When a new field is added in the header add a case statement here to
// populate it.
static_assert(
@@ -1680,7 +1690,7 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Buffer) {
}
size_t Header::size() const {
- switch (GET_VERSION(formatVersion())) {
+ switch (GET_VERSION(Version)) {
// When a new field is added to the header add a case statement here to
// compute the size as offset of the new field + size of the new field. This
// relies on the field being added to the end of the list.
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index ba21e01abfba..836206a4fd86 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1212,7 +1212,6 @@ Error IndexedMemProfReader::deserialize(const unsigned char *Start,
const uint64_t FirstWord =
support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
- memprof::IndexedVersion Version = memprof::Version0;
if (FirstWord == memprof::Version1 || FirstWord == memprof::Version2) {
// Everything is good. We can proceed to deserialize the rest.
Version = static_cast<memprof::IndexedVersion>(FirstWord);
@@ -1311,43 +1310,33 @@ Error IndexedInstrProfReader::readHeader() {
const IndexedInstrProf::Header *Header = &HeaderOr.get();
Cur += Header->size();
- Cur = readSummary((IndexedInstrProf::ProfVersion)Header->formatVersion(), Cur,
+ Cur = readSummary((IndexedInstrProf::ProfVersion)Header->Version, Cur,
/* UseCS */ false);
- if (Header->formatVersion() & VARIANT_MASK_CSIR_PROF)
- Cur =
- readSummary((IndexedInstrProf::ProfVersion)Header->formatVersion(), Cur,
- /* UseCS */ true);
+ if (Header->Version & VARIANT_MASK_CSIR_PROF)
+ Cur = readSummary((IndexedInstrProf::ProfVersion)Header->Version, Cur,
+ /* UseCS */ true);
// Read the hash type and start offset.
- IndexedInstrProf::HashT HashType = static_cast<IndexedInstrProf::HashT>(
- endian::byte_swap<uint64_t, llvm::endianness::little>(Header->HashType));
+ IndexedInstrProf::HashT HashType =
+ static_cast<IndexedInstrProf::HashT>(Header->HashType);
if (HashType > IndexedInstrProf::HashT::Last)
return error(instrprof_error::unsupported_hash_type);
- uint64_t HashOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(Header->HashOffset);
-
// The hash table with profile counts comes next.
auto IndexPtr = std::make_unique<InstrProfReaderIndex<OnDiskHashTableImplV3>>(
- Start + HashOffset, Cur, Start, HashType, Header->formatVersion());
+ Start + Header->HashOffset, Cur, Start, HashType, Header->Version);
// The MemProfOffset field in the header is only valid when the format
// version is higher than 8 (when it was introduced).
- if (GET_VERSION(Header->formatVersion()) >= 8 &&
- Header->formatVersion() & VARIANT_MASK_MEMPROF) {
- uint64_t MemProfOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(
- Header->MemProfOffset);
- if (Error E = MemProfReader.deserialize(Start, MemProfOffset))
+ if (GET_VERSION(Header->Version) >= 8 &&
+ Header->Version & VARIANT_MASK_MEMPROF) {
+ if (Error E = MemProfReader.deserialize(Start, Header->MemProfOffset))
return E;
}
// BinaryIdOffset field in the header is only valid when the format version
// is higher than 9 (when it was introduced).
- if (GET_VERSION(Header->formatVersion()) >= 9) {
- uint64_t BinaryIdOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(
- Header->BinaryIdOffset);
- const unsigned char *Ptr = Start + BinaryIdOffset;
+ if (GET_VERSION(Header->Version) >= 9) {
+ const unsigned char *Ptr = Start + Header->BinaryIdOffset;
// Read binary ids size.
BinaryIdsSize =
support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
@@ -1360,11 +1349,8 @@ Error IndexedInstrProfReader::readHeader() {
"corrupted binary ids");
}
- if (GET_VERSION(Header->formatVersion()) >= 12) {
- uint64_t VTableNamesOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(
- Header->VTableNamesOffset);
- const unsigned char *Ptr = Start + VTableNamesOffset;
+ if (GET_VERSION(Header->Version) >= 12) {
+ const unsigned char *Ptr = Start + Header->VTableNamesOffset;
CompressedVTableNamesLen =
support::endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
@@ -1376,12 +1362,9 @@ Error IndexedInstrProfReader::readHeader() {
return make_error<InstrProfError>(instrprof_error::truncated);
}
- if (GET_VERSION(Header->formatVersion()) >= 10 &&
- Header->formatVersion() & VARIANT_MASK_TEMPORAL_PROF) {
- uint64_t TemporalProfTracesOffset =
- endian::byte_swap<uint64_t, llvm::endianness::little>(
- Header->TemporalProfTracesOffset);
- const unsigned char *Ptr = Start + TemporalProfTracesOffset;
+ if (GET_VERSION(Header->Version) >= 10 &&
+ Header->Version & VARIANT_MASK_TEMPORAL_PROF) {
+ const unsigned char *Ptr = Start + Header->TemporalProfTracesOffset;
const auto *PtrEnd = (const unsigned char *)DataBuffer->getBufferEnd();
// Expect at least two 64 bit fields: NumTraces, and TraceStreamSize
if (Ptr + 2 * sizeof(uint64_t) > PtrEnd)
@@ -1506,6 +1489,55 @@ Expected<InstrProfRecord> IndexedInstrProfReader::getInstrProfRecord(
return error(instrprof_error::unknown_function);
}
+static Expected<memprof::MemProfRecord>
+getMemProfRecordV0(const memprof::IndexedMemProfRecord &IndexedRecord,
+ MemProfFrameHashTable &MemProfFrameTable) {
+ memprof::FrameIdConverter<MemProfFrameHashTable> FrameIdConv(
+ MemProfFrameTable);
+
+ memprof::MemProfRecord Record =
+ memprof::MemProfRecord(IndexedRecord, FrameIdConv);
+
+ // Check that all frame ids were successfully converted to frames.
+ if (FrameIdConv.LastUnmappedId) {
+ return make_error<InstrProfError>(instrprof_error::hash_mismatch,
+ "memprof frame not found for frame id " +
+ Twine(*FrameIdConv.LastUnmappedId));
+ }
+
+ return Record;
+}
+
+static Expected<memprof::MemProfRecord>
+getMemProfRecordV2(const memprof::IndexedMemProfRecord &IndexedRecord,
+ MemProfFrameHashTable &MemProfFrameTable,
+ MemProfCallStackHashTable &MemProfCallStackTable) {
+ memprof::FrameIdConverter<MemProfFrameHashTable> FrameIdConv(
+ MemProfFrameTable);
+
+ memprof::CallStackIdConverter<MemProfCallStackHashTable> CSIdConv(
+ MemProfCallStackTable, FrameIdConv);
+
+ memprof::MemProfRecord Record = IndexedRecord.toMemProfRecord(CSIdConv);
+
+ // Check that all call stack ids were successfully converted to call stacks.
+ if (CSIdConv.LastUnmappedId) {
+ return make_error<InstrProfError>(
+ instrprof_error::hash_mismatch,
+ "memprof call stack not found for call stack id " +
+ Twine(*CSIdConv.LastUnmappedId));
+ }
+
+ // Check that all frame ids were successfully converted to frames.
+ if (FrameIdConv.LastUnmappedId) {
+ return make_error<InstrProfError>(instrprof_error::hash_mismatch,
+ "memprof frame not found for frame id " +
+ Twine(*FrameIdConv.LastUnmappedId));
+ }
+
+ return Record;
+}
+
Expected<memprof::MemProfRecord>
IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const {
// TODO: Add memprof specific errors.
@@ -1518,41 +1550,27 @@ IndexedMemProfReader::getMemProfRecord(const uint64_t FuncNameHash) const {
instrprof_error::unknown_function,
"memprof record not found for function hash " + Twine(FuncNameHash));
- // Setup a callback to convert from frame ids to frame using the on-disk
- // FrameData hash table.
- memprof::FrameIdConverter<MemProfFrameHashTable> FrameIdConv(
- *MemProfFrameTable.get());
-
const memprof::IndexedMemProfRecord IndexedRecord = *Iter;
- memprof::MemProfRecord Record;
- if (MemProfCallStackTable) {
- // Setup a callback to convert call stack ids to call stacks using the
- // on-disk hash table.
- memprof::CallStackIdConverter<MemProfCallStackHashTable> CSIdConv(
- *MemProfCallStackTable.get(), FrameIdConv);
-
- Record = IndexedRecord.toMemProfRecord(CSIdConv);
-
- // Check that all call stack ids were successfully converted to call stacks.
- if (CSIdConv.LastUnmappedId) {
- return make_error<InstrProfError>(
- instrprof_error::hash_mismatch,
- "memprof call stack not found for call stack id " +
- Twine(*CSIdConv.LastUnmappedId));
- }
- } else {
- Record = memprof::MemProfRecord(IndexedRecord, FrameIdConv);
- }
-
- // Check that all frame ids were successfully converted to frames.
- if (FrameIdConv.LastUnmappedId) {
- return make_error<InstrProfError>(
- instrprof_error::hash_mismatch,
- "memprof frame not found for frame id " +
- Twine(*FrameIdConv.LastUnmappedId));
+ switch (Version) {
+ case memprof::Version0:
+ case memprof::Version1:
+ assert(MemProfFrameTable && "MemProfFrameTable must be available");
+ assert(!MemProfCallStackTable &&
+ "MemProfCallStackTable must not be available");
+ return getMemProfRecordV0(IndexedRecord, *MemProfFrameTable);
+ case memprof::Version2:
+ assert(MemProfFrameTable && "MemProfFrameTable must be available");
+ assert(MemProfCallStackTable && "MemProfCallStackTable must be available");
+ return getMemProfRecordV2(IndexedRecord, *MemProfFrameTable,
+ *MemProfCallStackTable);
}
- return Record;
+ return make_error<InstrProfError>(
+ instrprof_error::unsupported_version,
+ formatv("MemProf version {} not supported; "
+ "requires version between {} and {}, inclusive",
+ Version, memprof::MinimumSupportedVersion,
+ memprof::MaximumSupportedVersion));
}
Error IndexedInstrProfReader::getFunctionCounts(StringRef FuncName,
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 101992c38353..b67a9700b680 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -660,6 +660,37 @@ uint64_t InstrProfWriter::writeHeader(const IndexedInstrProf::Header &Header,
return BackPatchStartOffset;
}
+Error InstrProfWriter::writeVTableNames(ProfOStream &OS) {
+ std::vector<std::string> VTableNameStrs;
+ for (StringRef VTableName : VTableNames.keys())
+ VTableNameStrs.push_back(VTableName.str());
+
+ std::string CompressedVTableNames;
+ if (!VTableNameStrs.empty())
+ if (Error E = collectGlobalObjectNameStrings(
+ VTableNameStrs, compression::zlib::isAvailable(),
+ CompressedVTableNames))
+ return E;
+
+ const uint64_t CompressedStringLen = CompressedVTableNames.length();
+
+ // Record the length of compressed string.
+ OS.write(CompressedStringLen);
+
+ // Write the chars in compressed strings.
+ for (auto &c : CompressedVTableNames)
+ OS.writeByte(static_cast<uint8_t>(c));
+
+ // Pad up to a multiple of 8.
+ // InstrProfReader could read bytes according to 'CompressedStringLen'.
+ const uint64_t PaddedLength = alignTo(CompressedStringLen, 8);
+
+ for (uint64_t K = CompressedStringLen; K < PaddedLength; K++)
+ OS.writeByte(0);
+
+ return Error::success();
+}
+
Error InstrProfWriter::writeImpl(ProfOStream &OS) {
using namespace IndexedInstrProf;
using namespace support;
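The extracted writeVTableNames helper records the compressed length, writes the payload, and then emits zero bytes up to the next multiple of 8, so the following section stays 8-byte aligned while readers consume exactly CompressedStringLen bytes. A minimal sketch of the padding arithmetic (standalone; alignTo spelled out by hand):

#include <cassert>
#include <cstdint>

// Round Len up to the next multiple of 8 (the alignment used by the writer).
static uint64_t alignTo8(uint64_t Len) { return (Len + 7) & ~UINT64_C(7); }

int main() {
  assert(alignTo8(0) == 0);
  assert(alignTo8(1) == 8);
  assert(alignTo8(8) == 8);
  assert(alignTo8(13) == 16);
  // The writer emits alignTo8(Len) - Len zero bytes after the payload.
  uint64_t Len = 13;
  assert(alignTo8(Len) - Len == 3);
  return 0;
}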
@@ -682,7 +713,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
// Write the header.
IndexedInstrProf::Header Header;
- Header.Magic = IndexedInstrProf::Magic;
Header.Version = WritePrevVersion
? IndexedInstrProf::ProfVersion::Version11
: IndexedInstrProf::ProfVersion::CurrentVersion;
@@ -706,14 +736,6 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
if (static_cast<bool>(ProfileKind & InstrProfKind::TemporalProfile))
Header.Version |= VARIANT_MASK_TEMPORAL_PROF;
- Header.Unused = 0;
- Header.HashType = static_cast<uint64_t>(IndexedInstrProf::HashType);
- Header.HashOffset = 0;
- Header.MemProfOffset = 0;
- Header.BinaryIdOffset = 0;
- Header.TemporalProfTracesOffset = 0;
- Header.VTableNamesOffset = 0;
-
const uint64_t BackPatchStartOffset =
writeHeader(Header, WritePrevVersion, OS);
@@ -784,34 +806,9 @@ Error InstrProfWriter::writeImpl(ProfOStream &OS) {
uint64_t VTableNamesSectionStart = OS.tell();
- if (!WritePrevVersion) {
- std::vector<std::string> VTableNameStrs;
- for (StringRef VTableName : VTableNames.keys())
- VTableNameStrs.push_back(VTableName.str());
-
- std::string CompressedVTableNames;
- if (!VTableNameStrs.empty())
- if (Error E = collectGlobalObjectNameStrings(
- VTableNameStrs, compression::zlib::isAvailable(),
- CompressedVTableNames))
- return E;
-
- const uint64_t CompressedStringLen = CompressedVTableNames.length();
-
- // Record the length of compressed string.
- OS.write(CompressedStringLen);
-
- // Write the chars in compressed strings.
- for (auto &c : CompressedVTableNames)
- OS.writeByte(static_cast<uint8_t>(c));
-
- // Pad up to a multiple of 8.
- // InstrProfReader could read bytes according to 'CompressedStringLen'.
- const uint64_t PaddedLength = alignTo(CompressedStringLen, 8);
-
- for (uint64_t K = CompressedStringLen; K < PaddedLength; K++)
- OS.writeByte(0);
- }
+ if (!WritePrevVersion)
+ if (Error E = writeVTableNames(OS))
+ return E;
uint64_t TemporalProfTracesSectionStart = 0;
if (static_cast<bool>(ProfileKind & InstrProfKind::TemporalProfile)) {
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index f5789186094c..e5608644519d 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -208,6 +208,7 @@ static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
// Read the meminfo nodes.
const uint64_t NumNodes =
endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+ Record.AllocSites.reserve(NumNodes);
for (uint64_t I = 0; I < NumNodes; I++) {
IndexedAllocationInfo Node;
Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
@@ -219,6 +220,7 @@ static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
// Read the callsite information.
const uint64_t NumCtxs =
endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
+ Record.CallSiteIds.reserve(NumCtxs);
for (uint64_t J = 0; J < NumCtxs; J++) {
CallStackId CSId =
endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
@@ -247,13 +249,15 @@ MemProfRecord IndexedMemProfRecord::toMemProfRecord(
Callback) const {
MemProfRecord Record;
+ Record.AllocSites.reserve(AllocSites.size());
for (const memprof::IndexedAllocationInfo &IndexedAI : AllocSites) {
memprof::AllocationInfo AI;
AI.Info = IndexedAI.Info;
AI.CallStack = Callback(IndexedAI.CSId);
- Record.AllocSites.push_back(AI);
+ Record.AllocSites.push_back(std::move(AI));
}
+ Record.CallSites.reserve(CallSiteIds.size());
for (memprof::CallStackId CSId : CallSiteIds)
Record.CallSites.push_back(Callback(CSId));
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index c25babac844a..fc3be716087e 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -587,31 +587,27 @@ Error RawMemProfReader::symbolizeAndFilterStackFrames(
std::vector<std::string>
RawMemProfReader::peekBuildIds(MemoryBuffer *DataBuffer) {
const char *Next = DataBuffer->getBufferStart();
- // Use a set + vector since a profile file may contain multiple raw profile
+ // Use a SetVector since a profile file may contain multiple raw profile
// dumps, each with segment information. We want them unique and in order they
// were stored in the profile; the profiled binary should be the first entry.
// The runtime uses dl_iterate_phdr and the "... first object visited by
// callback is the main program."
// https://man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html
- std::vector<std::string> BuildIds;
- llvm::SmallSet<std::string, 10> BuildIdsSet;
+ llvm::SetVector<std::string, std::vector<std::string>,
+ llvm::SmallSet<std::string, 10>>
+ BuildIds;
while (Next < DataBuffer->getBufferEnd()) {
auto *Header = reinterpret_cast<const memprof::Header *>(Next);
const llvm::SmallVector<SegmentEntry> Entries =
readSegmentEntries(Next + Header->SegmentOffset);
- for (const auto &Entry : Entries) {
- const std::string Id = getBuildIdString(Entry);
- if (BuildIdsSet.contains(Id))
- continue;
- BuildIds.push_back(Id);
- BuildIdsSet.insert(Id);
- }
+ for (const auto &Entry : Entries)
+ BuildIds.insert(getBuildIdString(Entry));
Next += Header->TotalSize;
}
- return BuildIds;
+ return BuildIds.takeVector();
}
Error RawMemProfReader::readRawProfile(
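Replacing the vector-plus-SmallSet pair with a SetVector keeps peekBuildIds' first-seen ordering (so the profiled binary stays first) while deduplicating IDs across multiple raw profile dumps. A standalone sketch of the same behavior using only the standard library (a vector for order plus a set for membership, standing in for llvm::SetVector):

#include <cassert>
#include <set>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> Seen = {"main-binary", "libc", "main-binary",
                                         "libfoo", "libc"};
  std::vector<std::string> BuildIds;   // preserves insertion order
  std::set<std::string> Present;       // membership test
  for (const std::string &Id : Seen)
    if (Present.insert(Id).second)     // true only for the first occurrence
      BuildIds.push_back(Id);
  assert((BuildIds == std::vector<std::string>{"main-binary", "libc", "libfoo"}));
  return 0;
}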
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 03e888958a07..be4badc09efa 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -56,9 +56,6 @@ elseif( CMAKE_HOST_UNIX )
STRING(REGEX REPLACE "^lib" "" Backtrace_LIBFILE ${Backtrace_LIBFILE})
set(system_libs ${system_libs} ${Backtrace_LIBFILE})
endif()
- if( LLVM_ENABLE_TERMINFO )
- set(imported_libs ${imported_libs} Terminfo::terminfo)
- endif()
set(system_libs ${system_libs} ${LLVM_ATOMIC_LIB})
set(system_libs ${system_libs} ${LLVM_PTHREAD_LIB})
if( UNIX AND NOT (BEOS OR HAIKU) )
@@ -325,14 +322,6 @@ if(LLVM_ENABLE_ZSTD)
set(llvm_system_libs ${llvm_system_libs} "${zstd_library}")
endif()
-if(LLVM_ENABLE_TERMINFO)
- if(NOT terminfo_library)
- get_property(terminfo_library TARGET Terminfo::terminfo PROPERTY LOCATION)
- endif()
- get_library_name(${terminfo_library} terminfo_library)
- set(llvm_system_libs ${llvm_system_libs} "${terminfo_library}")
-endif()
-
set_property(TARGET LLVMSupport PROPERTY LLVM_SYSTEM_LIBS "${llvm_system_libs}")
diff --git a/llvm/lib/Support/Error.cpp b/llvm/lib/Support/Error.cpp
index 21d591530b41..34ec31e3b833 100644
--- a/llvm/lib/Support/Error.cpp
+++ b/llvm/lib/Support/Error.cpp
@@ -135,6 +135,9 @@ StringError::StringError(std::error_code EC, const Twine &S)
StringError::StringError(const Twine &S, std::error_code EC)
: Msg(S.str()), EC(EC), PrintMsgOnly(true) {}
+StringError::StringError(std::string &&S, std::error_code EC, bool PrintMsgOnly)
+ : Msg(S), EC(EC), PrintMsgOnly(PrintMsgOnly) {}
+
void StringError::log(raw_ostream &OS) const {
if (PrintMsgOnly) {
OS << Msg;
@@ -149,7 +152,7 @@ std::error_code StringError::convertToErrorCode() const {
return EC;
}
-Error createStringError(std::error_code EC, char const *Msg) {
+Error createStringError(std::string &&Msg, std::error_code EC) {
return make_error<StringError>(Msg, EC);
}
diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index ae90924cae1b..84b10ff5d1d0 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -341,17 +341,9 @@ unsigned Process::StandardErrColumns() {
return getColumns();
}
-#ifdef LLVM_ENABLE_TERMINFO
-// We manually declare these extern functions because finding the correct
-// headers from various terminfo, curses, or other sources is harder than
-// writing their specs down.
-extern "C" int setupterm(char *term, int filedes, int *errret);
-extern "C" struct term *set_curterm(struct term *termp);
-extern "C" int del_curterm(struct term *termp);
-extern "C" int tigetnum(char *capname);
-#endif
-
-bool checkTerminalEnvironmentForColors() {
+static bool terminalHasColors() {
+ // Check if the current terminal is one of terminals that are known to support
+ // ANSI color escape codes.
if (const char *TermStr = std::getenv("TERM")) {
return StringSwitch<bool>(TermStr)
.Case("ansi", true)
@@ -368,54 +360,10 @@ bool checkTerminalEnvironmentForColors() {
return false;
}
-static bool terminalHasColors(int fd) {
-#ifdef LLVM_ENABLE_TERMINFO
- // First, acquire a global lock because these C routines are thread hostile.
- static std::mutex TermColorMutex;
- std::lock_guard<std::mutex> G(TermColorMutex);
-
- struct term *previous_term = set_curterm(nullptr);
- int errret = 0;
- if (setupterm(nullptr, fd, &errret) != 0)
- // Regardless of why, if we can't get terminfo, we shouldn't try to print
- // colors.
- return false;
-
- // Test whether the terminal as set up supports color output. How to do this
- // isn't entirely obvious. We can use the curses routine 'has_colors' but it
- // would be nice to avoid a dependency on curses proper when we can make do
- // with a minimal terminfo parsing library. Also, we don't really care whether
- // the terminal supports the curses-specific color changing routines, merely
- // if it will interpret ANSI color escape codes in a reasonable way. Thus, the
- // strategy here is just to query the baseline colors capability and if it
- // supports colors at all to assume it will translate the escape codes into
- // whatever range of colors it does support. We can add more detailed tests
- // here if users report them as necessary.
- //
- // The 'tigetnum' routine returns -2 or -1 on errors, and might return 0 if
- // the terminfo says that no colors are supported.
- int colors_ti = tigetnum(const_cast<char *>("colors"));
- bool HasColors =
- colors_ti >= 0 ? colors_ti : checkTerminalEnvironmentForColors();
-
- // Now extract the structure allocated by setupterm and free its memory
- // through a really silly dance.
- struct term *termp = set_curterm(previous_term);
- (void)del_curterm(termp); // Drop any errors here.
-
- // Return true if we found a color capabilities for the current terminal.
- return HasColors;
-#else
- // When the terminfo database is not available, check if the current terminal
- // is one of terminals that are known to support ANSI color escape codes.
- return checkTerminalEnvironmentForColors();
-#endif
-}
-
bool Process::FileDescriptorHasColors(int fd) {
// A file descriptor has colors if it is displayed and the terminal has
// colors.
- return FileDescriptorIsDisplayed(fd) && terminalHasColors(fd);
+ return FileDescriptorIsDisplayed(fd) && terminalHasColors();
}
bool Process::StandardOutHasColors() {
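For reference, the environment-only check that now decides color support is just a StringSwitch over TERM; a minimal standalone sketch follows (the case list is abbreviated here and purely illustrative — the full set lives in terminalHasColors above):

    #include "llvm/ADT/StringSwitch.h"
    #include <cstdlib>

    static bool terminalHasColorsSketch() {
      // Mirror of the TERM-based check; extend the cases as needed.
      if (const char *TermStr = std::getenv("TERM"))
        return llvm::StringSwitch<bool>(TermStr)
            .Case("ansi", true)
            .Case("xterm", true)
            .StartsWith("screen", true)
            .Default(false);
      return false;
    }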
diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp
index 14e2308df4d7..549d537709bf 100644
--- a/llvm/lib/Support/raw_socket_stream.cpp
+++ b/llvm/lib/Support/raw_socket_stream.cpp
@@ -204,17 +204,26 @@ ListeningSocket::accept(std::chrono::milliseconds Timeout) {
auto Start = std::chrono::steady_clock::now();
#ifdef _WIN32
PollStatus = WSAPoll(FDs, 2, RemainingTime);
- if (PollStatus == SOCKET_ERROR) {
#else
PollStatus = ::poll(FDs, 2, RemainingTime);
+#endif
+ // If FD equals -1, then ListeningSocket::shutdown has been called and it is
+ // appropriate to return operation_canceled.
+ if (FD.load() == -1)
+ return llvm::make_error<StringError>(
+ std::make_error_code(std::errc::operation_canceled),
+ "Accept canceled");
+
+#ifdef _WIN32
+ if (PollStatus == SOCKET_ERROR) {
+#else
if (PollStatus == -1) {
#endif
- // Ignore error if caused by interupting signal
std::error_code PollErrCode = getLastSocketErrorCode();
+ // Ignore EINTR (signal occurred before any requested event) and retry
if (PollErrCode != std::errc::interrupted)
return llvm::make_error<StringError>(PollErrCode, "FD poll failed");
}
-
if (PollStatus == 0)
return llvm::make_error<StringError>(
std::make_error_code(std::errc::timed_out),
@@ -222,13 +231,7 @@ ListeningSocket::accept(std::chrono::milliseconds Timeout) {
if (FDs[0].revents & POLLNVAL)
return llvm::make_error<StringError>(
- std::make_error_code(std::errc::bad_file_descriptor),
- "File descriptor closed by another thread");
-
- if (FDs[1].revents & POLLIN)
- return llvm::make_error<StringError>(
- std::make_error_code(std::errc::operation_canceled),
- "Accept canceled");
+ std::make_error_code(std::errc::bad_file_descriptor));
auto Stop = std::chrono::steady_clock::now();
ElapsedTime +=
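A caller can now tell cancellation apart from a timeout by inspecting the error code carried by the StringError. A sketch under the assumption that accept() yields an llvm::Expected, as the make_error calls above imply (the helper name is hypothetical):

    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"
    #include <system_error>

    template <typename T> void reportAcceptError(llvm::Expected<T> &Result) {
      if (Result)
        return;
      llvm::handleAllErrors(Result.takeError(), [](llvm::StringError &SE) {
        std::error_code EC = SE.convertToErrorCode();
        if (EC == std::errc::operation_canceled)
          return; // shutdown() raced with accept(); treat as a clean exit
        if (EC == std::errc::timed_out)
          return; // no incoming connection within the requested timeout
        llvm::errs() << "accept failed: " << SE.getMessage() << "\n";
      });
    }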
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index bfcafc6442d2..9a804c12939c 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,6 +38,8 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
static const MCPhysReg ZRegList[] = {AArch64::Z0, AArch64::Z1, AArch64::Z2,
AArch64::Z3, AArch64::Z4, AArch64::Z5,
AArch64::Z6, AArch64::Z7};
+static const MCPhysReg PRegList[] = {AArch64::P0, AArch64::P1, AArch64::P2,
+ AArch64::P3};
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
@@ -59,11 +61,17 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
// CCAssignFn again we want it to behave as if all remaining registers are
// allocated. This will force the code to pass the tuple indirectly in
// accordance with the PCS.
- bool RegsAllocated[8];
+ bool ZRegsAllocated[8];
for (int I = 0; I < 8; I++) {
- RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+ ZRegsAllocated[I] = State.isAllocated(ZRegList[I]);
State.AllocateReg(ZRegList[I]);
}
+ // The same applies to P registers.
+ bool PRegsAllocated[4];
+ for (int I = 0; I < 4; I++) {
+ PRegsAllocated[I] = State.isAllocated(PRegList[I]);
+ State.AllocateReg(PRegList[I]);
+ }
auto &It = PendingMembers[0];
CCAssignFn *AssignFn =
@@ -79,8 +87,11 @@ static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
// Return the register state back to how it was before, leaving any
// unallocated registers available for other smaller types.
for (int I = 0; I < 8; I++)
- if (!RegsAllocated[I])
+ if (!ZRegsAllocated[I])
State.DeallocateReg(ZRegList[I]);
+ for (int I = 0; I < 4; I++)
+ if (!PRegsAllocated[I])
+ State.DeallocateReg(PRegList[I]);
// All pending members have now been allocated
PendingMembers.clear();
@@ -140,9 +151,15 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
RegList = DRegList;
else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
RegList = QRegList;
- else if (LocVT.isScalableVector())
- RegList = ZRegList;
- else {
+ else if (LocVT.isScalableVector()) {
+ // Scalable masks should be passed in predicate registers.
+ if (LocVT == MVT::nxv1i1 || LocVT == MVT::nxv2i1 || LocVT == MVT::nxv4i1 ||
+ LocVT == MVT::nxv8i1 || LocVT == MVT::nxv16i1 ||
+ LocVT == MVT::aarch64svcount)
+ RegList = PRegList;
+ else
+ RegList = ZRegList;
+ } else {
// Not an array we want to split up after all.
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 10cad6d19244..1c7f6b870d39 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -295,5 +295,6 @@ def AArch64PostLegalizerCombiner
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
- commute_constant_to_rhs]> {
+ commute_constant_to_rhs,
+ push_freeze_to_prevent_poison_from_propagating]> {
}
diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index ba0b760ce3d7..ffb899a30145 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -223,13 +223,6 @@ def FeatureSVE : Extension<"sve", "SVE",
"Enable Scalable Vector Extension (SVE) instructions (FEAT_SVE)", [FeatureFullFP16],
"FEAT_SVE", "+sve,+fullfp16,+fp-armv8,+neon", 310>;
-def FeatureFPMR : Extension<"fpmr", "FPMR",
- "Enable FPMR Register (FEAT_FPMR)">;
-
-let FMVDependencies = "+fpmr" in
-def FeatureFP8 : Extension<"fp8", "FP8",
- "Enable FP8 instructions (FEAT_FP8)">;
-
// This flag is currently still labeled as Experimental, but when fully
// implemented this should tell the compiler to use the zeroing pseudos to
// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive
@@ -667,41 +660,44 @@ def FeatureSME2p1 : Extension<"sme2p1", "SME2p1",
def FeatureFAMINMAX: Extension<"faminmax", "FAMINMAX",
"Enable FAMIN and FAMAX instructions (FEAT_FAMINMAX)">;
-let FMVDependencies = "+fpmr" in
+def FeatureLUT: Extension<"lut", "LUT",
+ "Enable Lookup Table instructions (FEAT_LUT)">;
+
+def FeatureFP8 : Extension<"fp8", "FP8",
+ "Enable FP8 instructions (FEAT_FP8)", [FeatureFAMINMAX, FeatureLUT, FeatureBF16]>;
+
def FeatureFP8FMA : Extension<"fp8fma", "FP8FMA",
- "Enable fp8 multiply-add instructions (FEAT_FP8FMA)">;
+ "Enable fp8 multiply-add instructions (FEAT_FP8FMA)", [FeatureFP8]>;
let FMVDependencies = "+sme2" in
def FeatureSSVE_FP8FMA : Extension<"ssve-fp8fma", "SSVE_FP8FMA",
- "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2]>;
+ "Enable SVE2 fp8 multiply-add instructions (FEAT_SSVE_FP8FMA)", [FeatureSME2, FeatureFP8]>;
+def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4",
+ "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)", [FeatureFP8FMA]>;
+
def FeatureFP8DOT2: Extension<"fp8dot2", "FP8DOT2",
- "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)">;
+ "Enable fp8 2-way dot instructions (FEAT_FP8DOT2)", [FeatureFP8DOT4]>;
let FMVDependencies = "+sme2" in
-def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2",
- "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSME2]>;
-
-def FeatureFP8DOT4: Extension<"fp8dot4", "FP8DOT4",
- "Enable fp8 4-way dot instructions (FEAT_FP8DOT4)">;
+def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4",
+ "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSSVE_FP8FMA]>;
let FMVDependencies = "+sme2" in
-def FeatureSSVE_FP8DOT4 : Extension<"ssve-fp8dot4", "SSVE_FP8DOT4",
- "Enable SVE2 fp8 4-way dot product instructions (FEAT_SSVE_FP8DOT4)", [FeatureSME2]>;
-def FeatureLUT: Extension<"lut", "LUT",
- "Enable Lookup Table instructions (FEAT_LUT)">;
+def FeatureSSVE_FP8DOT2 : Extension<"ssve-fp8dot2", "SSVE_FP8DOT2",
+ "Enable SVE2 fp8 2-way dot product instructions (FEAT_SSVE_FP8DOT2)", [FeatureSSVE_FP8DOT4]>;
def FeatureSME_LUTv2 : Extension<"sme-lutv2", "SME_LUTv2",
"Enable Scalable Matrix Extension (SME) LUTv2 instructions (FEAT_SME_LUTv2)">;
-let FMVDependencies = "+fp8,+sme2" in
-def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16",
- "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSME2, FeatureFP8]>;
-
let FMVDependencies = "+sme2,+fp8" in
def FeatureSMEF8F32 : Extension<"sme-f8f32", "SMEF8F32",
"Enable Scalable Matrix Extension (SME) F8F32 instructions (FEAT_SME_F8F32)", [FeatureSME2, FeatureFP8]>;
+let FMVDependencies = "+fp8,+sme2" in
+def FeatureSMEF8F16 : Extension<"sme-f8f16", "SMEF8F16",
+ "Enable Scalable Matrix Extension (SME) F8F16 instructions(FEAT_SME_F8F16)", [FeatureSMEF8F32]>;
+
def FeatureAppleA7SysReg : SubtargetFeature<"apple-a7-sysreg", "HasAppleA7SysReg", "true",
"Apple A7 (the CPU formerly known as Cyclone)">;
@@ -869,7 +865,7 @@ def HasV9_4aOps : Architecture64<9, 4, "a", "v9.4a",
FeatureRASv2])>;
def HasV9_5aOps : Architecture64<9, 5, "a", "v9.5a",
[HasV9_4aOps, FeatureCPA],
- !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA])>;
+ !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA, FeatureLUT, FeatureFAMINMAX])>;
def HasV8_0rOps : Architecture64<8, 0, "r", "v8r",
[ //v8.1
FeatureCRC, FeaturePAN, FeatureLSE, FeatureCONTEXTIDREL2,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e31a27e9428e..25ba8d850030 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1871,9 +1871,11 @@ bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
if (!Subtarget->hasSVEorSME())
return true;
- // We can only use the BRKB + CNTP sequence with legal predicate types.
+ // We can only use the BRKB + CNTP sequence with legal predicate types. We can
+ // also support fixed-width predicates.
return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
- VT != MVT::nxv2i1;
+ VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
+ VT != MVT::v4i1 && VT != MVT::v2i1;
}
void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
@@ -5838,9 +5840,20 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
}
case Intrinsic::experimental_cttz_elts: {
- SDValue NewCttzElts =
- DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1));
+ SDValue CttzOp = Op.getOperand(1);
+ EVT VT = CttzOp.getValueType();
+ assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
+ if (VT.isFixedLengthVector()) {
+ // We can use SVE instructions to lower this intrinsic by first creating
+ // an SVE predicate register mask from the fixed-width vector.
+ EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
+ CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
+ }
+
+ SDValue NewCttzElts =
+ DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
}
}
@@ -7235,7 +7248,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
unsigned NumParts = 1;
if (Ins[i].Flags.isInConsecutiveRegs()) {
- assert(!Ins[i].Flags.isInConsecutiveRegsLast());
while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
++NumParts;
}
@@ -8232,7 +8244,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
uint64_t PartSize = StoreSize;
unsigned NumParts = 1;
if (Outs[i].Flags.isInConsecutiveRegs()) {
- assert(!Outs[i].Flags.isInConsecutiveRegsLast());
while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
++NumParts;
StoreSize *= NumParts;
@@ -13530,11 +13541,9 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
DAG.getConstant(NumElts, dl, MVT::i64));
if (Even && !Odd)
- return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
- RHS);
+ return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
if (Odd && !Even)
- return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
- RHS);
+ return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index a39e3b7be76d..4830033b2352 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -171,8 +171,6 @@ def HasSME2 : Predicate<"Subtarget->hasSME2()">,
AssemblerPredicateWithAll<(all_of FeatureSME2), "sme2">;
def HasSME2p1 : Predicate<"Subtarget->hasSME2p1()">,
AssemblerPredicateWithAll<(all_of FeatureSME2p1), "sme2p1">;
-def HasFPMR : Predicate<"Subtarget->hasFPMR()">,
- AssemblerPredicateWithAll<(all_of FeatureFPMR), "fpmr">;
def HasFP8 : Predicate<"Subtarget->hasFP8()">,
AssemblerPredicateWithAll<(all_of FeatureFP8), "fp8">;
def HasFAMINMAX : Predicate<"Subtarget->hasFAMINMAX()">,
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
index abde099be382..90bf089dbebf 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
@@ -231,7 +231,7 @@ MachineMemOperand *createCheckMemOperand(MachineFunction &MF,
} // namespace
-MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
+void llvm::AArch64PAuth::checkAuthenticatedRegister(
MachineBasicBlock::iterator MBBI, AuthCheckMethod Method,
Register AuthenticatedReg, Register TmpReg, bool UseIKey, unsigned BrkImm) {
@@ -246,13 +246,13 @@ MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
default:
break;
case AuthCheckMethod::None:
- return MBB;
+ return;
case AuthCheckMethod::DummyLoad:
BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRWui), getWRegFromXReg(TmpReg))
.addReg(AuthenticatedReg)
.addImm(0)
.addMemOperand(createCheckMemOperand(MF, Subtarget));
- return MBB;
+ return;
}
// Control flow has to be changed, so arrange new MBBs.
@@ -287,7 +287,7 @@ MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
.addReg(TmpReg)
.addImm(62)
.addMBB(BreakBlock);
- return *SuccessBlock;
+ return;
case AuthCheckMethod::XPACHint:
assert(AuthenticatedReg == AArch64::LR &&
"XPACHint mode is only compatible with checking the LR register");
@@ -304,7 +304,7 @@ MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
BuildMI(CheckBlock, DL, TII->get(AArch64::Bcc))
.addImm(AArch64CC::NE)
.addMBB(BreakBlock);
- return *SuccessBlock;
+ return;
}
llvm_unreachable("Unknown AuthCheckMethod enum");
}
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.h b/llvm/lib/Target/AArch64/AArch64PointerAuth.h
index e1ceaed58abe..4ffda7478224 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.h
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.h
@@ -98,14 +98,10 @@ enum class AuthCheckMethod {
/// using an I-key or D-key and which register can be used as temporary.
/// If an explicit BRK instruction is used to generate an exception, BrkImm
/// specifies its immediate operand.
-///
-/// \returns The machine basic block containing the code that is executed
-/// after the check succeeds.
-MachineBasicBlock &checkAuthenticatedRegister(MachineBasicBlock::iterator MBBI,
- AuthCheckMethod Method,
- Register AuthenticatedReg,
- Register TmpReg, bool UseIKey,
- unsigned BrkImm);
+void checkAuthenticatedRegister(MachineBasicBlock::iterator MBBI,
+ AuthCheckMethod Method,
+ Register AuthenticatedReg, Register TmpReg,
+ bool UseIKey, unsigned BrkImm);
/// Returns the number of bytes added by checkAuthenticatedRegister.
unsigned getCheckerSizeInBytes(AuthCheckMethod Method);
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 5d185fcaefc4..8bc26eeef34d 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -64,12 +64,6 @@ ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
"Should only be used for testing register allocator."),
cl::CommaSeparated, cl::Hidden);
-static cl::opt<bool> ForceStreamingCompatibleSVE(
- "force-streaming-compatible-sve",
- cl::desc(
- "Force the use of streaming-compatible SVE code for all functions"),
- cl::Hidden);
-
static cl::opt<AArch64PAuth::AuthCheckMethod>
AuthenticatedLRCheckMethod("aarch64-authenticated-lr-check-method",
cl::Hidden,
@@ -316,15 +310,14 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride,
unsigned MaxSVEVectorSizeInBitsOverride,
- bool StreamingSVEMode,
- bool StreamingCompatibleSVEMode,
+ bool IsStreaming, bool IsStreamingCompatible,
bool HasMinSize)
: AArch64GenSubtargetInfo(TT, CPU, TuneCPU, FS),
ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
ReserveXRegisterForRA(AArch64::GPR64commonRegClass.getNumRegs()),
CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
- IsLittle(LittleEndian), StreamingSVEMode(StreamingSVEMode),
- StreamingCompatibleSVEMode(StreamingCompatibleSVEMode),
+ IsLittle(LittleEndian), IsStreaming(IsStreaming),
+ IsStreamingCompatible(IsStreamingCompatible),
MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU, HasMinSize)),
@@ -547,20 +540,6 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
bool AArch64Subtarget::useAA() const { return UseAA; }
-bool AArch64Subtarget::isStreamingCompatible() const {
- return StreamingCompatibleSVEMode || ForceStreamingCompatibleSVE;
-}
-
-bool AArch64Subtarget::isNeonAvailable() const {
- return hasNEON() &&
- (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
-}
-
-bool AArch64Subtarget::isSVEAvailable() const {
- return hasSVE() &&
- (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
-}
-
// If return address signing is enabled, tail calls are emitted as follows:
//
// ```
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 3f3eefc4f680..7ef7a89b5749 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -79,8 +79,8 @@ protected:
bool IsLittle;
- bool StreamingSVEMode;
- bool StreamingCompatibleSVEMode;
+ bool IsStreaming;
+ bool IsStreamingCompatible;
unsigned MinSVEVectorSizeInBits;
unsigned MaxSVEVectorSizeInBits;
unsigned VScaleForTuning = 2;
@@ -120,8 +120,7 @@ public:
StringRef FS, const TargetMachine &TM, bool LittleEndian,
unsigned MinSVEVectorSizeInBitsOverride = 0,
unsigned MaxSVEVectorSizeInBitsOverride = 0,
- bool StreamingSVEMode = false,
- bool StreamingCompatibleSVEMode = false,
+ bool IsStreaming = false, bool IsStreamingCompatible = false,
bool HasMinSize = false);
// Getters for SubtargetFeatures defined in tablegen
@@ -165,20 +164,26 @@ public:
bool isXRaySupported() const override { return true; }
/// Returns true if the function has a streaming body.
- bool isStreaming() const { return StreamingSVEMode; }
+ bool isStreaming() const { return IsStreaming; }
/// Returns true if the function has a streaming-compatible body.
- bool isStreamingCompatible() const;
+ bool isStreamingCompatible() const { return IsStreamingCompatible; }
/// Returns true if the target has NEON and the function at runtime is known
/// to have NEON enabled (e.g. the function is known not to be in streaming-SVE
/// mode, which disables NEON instructions).
- bool isNeonAvailable() const;
+ bool isNeonAvailable() const {
+ return hasNEON() &&
+ (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
+ }
/// Returns true if the target has SVE and can use the full range of SVE
/// instructions, for example because it knows the function is known not to be
/// in streaming-SVE mode or when the target has FEAT_FA64 enabled.
- bool isSVEAvailable() const;
+ bool isSVEAvailable() const {
+ return hasSVE() &&
+ (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
+ }
unsigned getMinVectorRegisterBitWidth() const {
// Don't assume any minimum vector size when PSTATE.SM may not be 0, because
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 0564741c4970..0b5bc97674c7 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -1943,11 +1943,9 @@ def : RWSysReg<"PM", 0b11, 0b000, 0b0100, 0b0011, 0b001>;
// 2023 ISA Extension
// AArch64 Floating-point Mode Register controls behaviors of the FP8
// instructions (FEAT_FPMR)
-let Requires = [{ {AArch64::FeatureFPMR} }] in {
// Op0 Op1 CRn CRm Op2
def : ROSysReg<"ID_AA64FPFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b111>;
def : RWSysReg<"FPMR", 0b11, 0b011, 0b0100, 0b0100, 0b010>;
-}
// v9.5a Software Stepping Enhancements (FEAT_STEP2)
// Op0 Op1 CRn CRm Op2
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index df802cf42526..945ab5cf1f30 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -187,6 +187,11 @@ static cl::opt<unsigned> SVEVectorBitsMinOpt(
"with zero meaning no minimum size is assumed."),
cl::init(0), cl::Hidden);
+static cl::opt<bool> ForceStreamingCompatible(
+ "force-streaming-compatible",
+ cl::desc("Force the use of streaming-compatible code for all functions"),
+ cl::init(false), cl::Hidden);
+
extern cl::opt<bool> EnableHomogeneousPrologEpilog;
static cl::opt<bool> EnableGISelLoadStoreOptPreLegal(
@@ -408,10 +413,11 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
StringRef FS = FSAttr.isValid() ? FSAttr.getValueAsString() : TargetFS;
bool HasMinSize = F.hasMinSize();
- bool StreamingSVEMode = F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
- F.hasFnAttribute("aarch64_pstate_sm_body");
- bool StreamingCompatibleSVEMode =
- F.hasFnAttribute("aarch64_pstate_sm_compatible");
+ bool IsStreaming = F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
+ F.hasFnAttribute("aarch64_pstate_sm_body");
+ bool IsStreamingCompatible =
+ F.hasFnAttribute("aarch64_pstate_sm_compatible") ||
+ ForceStreamingCompatible;
unsigned MinSVEVectorSize = 0;
unsigned MaxSVEVectorSize = 0;
@@ -439,10 +445,9 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
SmallString<512> Key;
raw_svector_ostream(Key) << "SVEMin" << MinSVEVectorSize << "SVEMax"
- << MaxSVEVectorSize
- << "StreamingSVEMode=" << StreamingSVEMode
- << "StreamingCompatibleSVEMode="
- << StreamingCompatibleSVEMode << CPU << TuneCPU << FS
+ << MaxSVEVectorSize << "IsStreaming=" << IsStreaming
+ << "IsStreamingCompatible=" << IsStreamingCompatible
+ << CPU << TuneCPU << FS
<< "HasMinSize=" << HasMinSize;
auto &I = SubtargetMap[Key];
@@ -453,12 +458,10 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
resetTargetOptions(F);
I = std::make_unique<AArch64Subtarget>(
TargetTriple, CPU, TuneCPU, FS, *this, isLittle, MinSVEVectorSize,
- MaxSVEVectorSize, StreamingSVEMode, StreamingCompatibleSVEMode,
- HasMinSize);
+ MaxSVEVectorSize, IsStreaming, IsStreamingCompatible, HasMinSize);
}
- assert((!StreamingSVEMode || I->hasSME()) &&
- "Expected SME to be available");
+ assert((!IsStreaming || I->hasSME()) && "Expected SME to be available");
return I.get();
}
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index c9bba9bf6314..13a68b7dcf98 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -3718,7 +3718,6 @@ static const struct Extension {
{"sb", {AArch64::FeatureSB}},
{"ssbs", {AArch64::FeatureSSBS}},
{"tme", {AArch64::FeatureTME}},
- {"fpmr", {AArch64::FeatureFPMR}},
{"fp8", {AArch64::FeatureFP8}},
{"faminmax", {AArch64::FeatureFAMINMAX}},
{"fp8fma", {AArch64::FeatureFP8FMA}},
@@ -3731,7 +3730,7 @@ static const struct Extension {
{"sme-lutv2", {AArch64::FeatureSME_LUTv2}},
{"sme-f8f16", {AArch64::FeatureSMEF8F16}},
{"sme-f8f32", {AArch64::FeatureSMEF8F32}},
- {"sme-fa64", {AArch64::FeatureSMEFA64}},
+ {"sme-fa64", {AArch64::FeatureSMEFA64}},
{"cpa", {AArch64::FeatureCPA}},
{"tlbiw", {AArch64::FeatureTLBIW}},
};
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 0dd4a78f962d..6493a2ee4a93 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -430,6 +430,55 @@ public:
return false;
}
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.implicit_defs().size();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+ // 32-bit General Purpose Register class.
+ const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
+ // Floating Point Register classes.
+ const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
+ const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
+ const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
+ const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+ const MCRegisterClass &FPR128RC =
+ MRI.getRegClass(AArch64::FPR128RegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // An update to the lower 32 bits of a 64 bit integer register is
+ // architecturally defined to zero extend the upper 32 bits on a write.
+ if (GPR32RC.contains(RegID))
+ return true;
+ // SIMD&FP instructions operating on scalar data only acccess the lower
+ // bits of a register, the upper bits are zero extended on a write. For
+ // SIMD vector registers smaller than 128-bits, the upper 64-bits of the
+ // register are zero extended on a write.
+ // When VL is higher than 128 bits, any write to a SIMD&FP register sets
+ // bits higher than 128 to zero.
+ return FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+ FPR32RC.contains(RegID) || FPR64RC.contains(RegID) ||
+ FPR128RC.contains(RegID);
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.implicit_defs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+ }
+
std::vector<std::pair<uint64_t, uint64_t>>
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
const Triple &TargetTriple) const override {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index bd48a5f80c82..cad4a3430327 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -19,7 +19,6 @@
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUResourceUsageAnalysis.h"
-#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
@@ -29,6 +28,7 @@
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -205,8 +205,9 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
if (STM.isMesaKernel(F) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
- amd_kernel_code_t KernelCode;
+ AMDGPUMCKernelCodeT KernelCode;
getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
+ KernelCode.validate(&STM, MF->getContext());
getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
}
@@ -1317,7 +1318,7 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
}
}
-void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
+void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
const SIProgramInfo &CurrentProgramInfo,
const MachineFunction &MF) const {
const Function &F = MF.getFunction();
@@ -1328,24 +1329,22 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Ctx = MF.getContext();
- AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
+ Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
- Out.compute_pgm_resource_registers =
- CurrentProgramInfo.getComputePGMRSrc1(STM) |
- (CurrentProgramInfo.getComputePGMRSrc2() << 32);
+ Out.compute_pgm_resource1_registers =
+ CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
+ Out.compute_pgm_resource2_registers =
+ CurrentProgramInfo.getComputePGMRSrc2(Ctx);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
- if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, Ctx))
- Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
+ Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
- AMD_HSA_BITS_SET(Out.code_properties,
- AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+ AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
- Out.code_properties |=
- AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
+ Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}
if (UserSGPRInfo.hasDispatchPtr())
@@ -1371,10 +1370,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
Align MaxKernArgAlign;
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
- Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx);
- Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx);
- Out.workitem_private_segment_byte_size =
- getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx);
+ Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
+ Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
+ Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
// kernarg_segment_alignment is specified as log of the alignment.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 16d8952a533e..87156f27fc6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -17,8 +17,6 @@
#include "SIProgramInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
-struct amd_kernel_code_t;
-
namespace llvm {
class AMDGPUMachineFunction;
@@ -29,6 +27,7 @@ class MCOperand;
namespace AMDGPU {
struct MCKernelDescriptor;
+struct AMDGPUMCKernelCodeT;
namespace HSAMD {
class MetadataStreamer;
}
@@ -50,7 +49,8 @@ private:
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
- void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
+ void getAmdKernelCode(AMDGPU::AMDGPUMCKernelCodeT &Out,
+ const SIProgramInfo &KernelInfo,
const MachineFunction &MF) const;
/// Emit register usage information so that the GPU driver
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c11c7a57e059..e35957338da7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2526,6 +2526,14 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
+void AMDGPUDAGToDAGISel::SelectPOPSExitingWaveID(SDNode *N) {
+ // TODO: Select this with a tablegen pattern. This is tricky because the
+ // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
+ // mayLoad/mayStore and tablegen complains about the mismatch.
+ SDValue Reg = CurDAG->getRegister(AMDGPU::SRC_POPS_EXITING_WAVE_ID, MVT::i32);
+ CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, N->getVTList(), Reg);
+}
+
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
@@ -2682,6 +2690,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
+ case Intrinsic::amdgcn_pops_exiting_wave_id:
+ SelectPOPSExitingWaveID(N);
+ return;
}
SelectCode(N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f987b747c0e2..53d25b4cf4ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -274,6 +274,7 @@ private:
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N);
+ void SelectPOPSExitingWaveID(SDNode *N);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b48a09489653..04d9bb5cb18a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2079,6 +2079,21 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectPOPSExitingWaveID(
+ MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // TODO: Select this with a tablegen pattern. This is tricky because the
+ // intrinsic is IntrReadMem/IntrWriteMem but the instruction is not marked
+ // mayLoad/mayStore and tablegen complains about the mismatch.
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
+ .addReg(AMDGPU::SRC_POPS_EXITING_WAVE_ID);
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
@@ -2129,6 +2144,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectSBarrierSignalIsfirst(I, IntrinsicID);
case Intrinsic::amdgcn_s_barrier_leave:
return selectSBarrierLeave(I);
+ case Intrinsic::amdgcn_pops_exiting_wave_id:
+ return selectPOPSExitingWaveID(I);
}
return selectImpl(I, *CoverageInfo);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index f561d5d29efc..48f3b1811801 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -125,6 +125,7 @@ private:
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
+ bool selectPOPSExitingWaveID(MachineInstr &MI) const;
bool selectImageIntrinsic(MachineInstr &MI,
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index aab79ceb57f2..c515138d95a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1215,16 +1215,36 @@ bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
"__rootn2div");
replaceCall(FPOp, nval);
return true;
- } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x)
- if (FunctionCallee FPExpr =
- getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) {
- LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
- << ")\n");
- Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
- replaceCall(FPOp, nval);
- return true;
- }
}
+
+ if (ci_opr1 == -2 &&
+ shouldReplaceLibcallWithIntrinsic(CI,
+ /*AllowMinSizeF32=*/true,
+ /*AllowF64=*/true)) {
+ // rootn(x, -2) = rsqrt(x)
+
+ // The original rootn had looser ulp requirements than the resultant sqrt
+ // and fdiv.
+ MDBuilder MDHelper(M->getContext());
+ MDNode *FPMD = MDHelper.createFPMath(std::max(FPOp->getFPAccuracy(), 2.0f));
+
+ // TODO: Could handle strictfp but need to fix strict sqrt emission
+ FastMathFlags FMF = FPOp->getFastMathFlags();
+ FMF.setAllowContract(true);
+
+ CallInst *Sqrt = B.CreateUnaryIntrinsic(Intrinsic::sqrt, opr0, CI);
+ Instruction *RSqrt = cast<Instruction>(
+ B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0), Sqrt));
+ Sqrt->setFastMathFlags(FMF);
+ RSqrt->setFastMathFlags(FMF);
+ RSqrt->setMetadata(LLVMContext::MD_fpmath, FPMD);
+
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
+ << ")\n");
+ replaceCall(CI, RSqrt);
+ return true;
+ }
+
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 56345d14a331..dbb42a60f71f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5132,6 +5132,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_pops_exiting_wave_id:
+ return getDefaultMappingSOP(MI);
default:
return getInvalidInstructionMapping();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 84320d296a03..437e01c37c6b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1129,31 +1129,56 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
int Index, VectorType *SubTp,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ if (!isa<FixedVectorType>(VT))
+ return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
+
Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
- // Treat extractsubvector as single op permutation.
- bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
- if (IsExtractSubvector)
- Kind = TTI::SK_PermuteSingleSrc;
-
- if (ST->hasVOP3PInsts()) {
- if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
- DL.getTypeSizeInBits(VT->getElementType()) == 16) {
- // With op_sel VOP3P instructions freely can access the low half or high
- // half of a register, so any swizzle is free.
- switch (Kind) {
- case TTI::SK_Broadcast:
- case TTI::SK_Reverse:
- case TTI::SK_PermuteSingleSrc:
+ // Larger vector widths may require additional instructions, but are
+ // typically cheaper than scalarized versions.
+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
+ if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ DL.getTypeSizeInBits(VT->getElementType()) == 16) {
+ bool HasVOP3P = ST->hasVOP3PInsts();
+ unsigned RequestedElts =
+ count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
+ if (RequestedElts == 0)
+ return 0;
+ switch (Kind) {
+ case TTI::SK_Broadcast:
+ case TTI::SK_Reverse:
+ case TTI::SK_PermuteSingleSrc: {
+ // With op_sel, VOP3P instructions can freely access the low half or high
+ // half of a register, so any swizzle of two elements is free.
+ if (HasVOP3P && NumVectorElts == 2)
return 0;
- default:
- break;
- }
+ unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+ // SK_Broadcast just reuses the same mask
+ unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
+ return NumPerms + NumPermMasks;
+ }
+ case TTI::SK_ExtractSubvector:
+ case TTI::SK_InsertSubvector: {
+ // Even aligned accesses are free
+ if (!(Index % 2))
+ return 0;
+ // Insert/extract subvectors only require shifts / extract code to get the
+ // relevant bits
+ return alignTo(RequestedElts, 2) / 2;
+ }
+ case TTI::SK_PermuteTwoSrc:
+ case TTI::SK_Splice:
+ case TTI::SK_Select: {
+ unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
+ // SK_Select just reuses the same mask
+ unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
+ return NumPerms + NumPermMasks;
+ }
+
+ default:
+ break;
}
}
- // Restore optimal kind.
- if (IsExtractSubvector)
- Kind = TTI::SK_ExtractSubvector;
return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
}
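As a worked example of the new cost model (illustrative): a SK_Reverse of a <4 x i16> shuffle on a VOP3P-capable target requests four elements, so NumPerms = alignTo(4, 2) / 2 = 2 and NumPermMasks = 2 (reverse does not reuse a single mask), giving a total cost of 4, whereas the same shuffle over just two 16-bit elements remains free.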
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index c08c35c45984..dcd4b22f4057 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1340,7 +1340,7 @@ private:
bool ParseDirectiveAMDGCNTarget();
bool ParseDirectiveAMDHSACodeObjectVersion();
bool ParseDirectiveAMDHSAKernel();
- bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
+ bool ParseAMDKernelCodeTValue(StringRef ID, AMDGPUMCKernelCodeT &Header);
bool ParseDirectiveAMDKernelCodeT();
// TODO: Possibly make subtargetHasRegister const.
bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo);
@@ -5863,7 +5863,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSACodeObjectVersion() {
}
bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
- amd_kernel_code_t &Header) {
+ AMDGPUMCKernelCodeT &C) {
// max_scratch_backing_memory_byte_size is deprecated. Ignore it while parsing
// assembly for backwards compatibility.
if (ID == "max_scratch_backing_memory_byte_size") {
@@ -5873,25 +5873,13 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
SmallString<40> ErrStr;
raw_svector_ostream Err(ErrStr);
- if (!parseAmdKernelCodeField(ID, getParser(), Header, Err)) {
+ if (!C.ParseKernelCodeT(ID, getParser(), Err)) {
return TokError(Err.str());
}
Lex();
- if (ID == "enable_dx10_clamp") {
- if (G_00B848_DX10_CLAMP(Header.compute_pgm_resource_registers) &&
- isGFX12Plus())
- return TokError("enable_dx10_clamp=1 is not allowed on GFX12+");
- }
-
- if (ID == "enable_ieee_mode") {
- if (G_00B848_IEEE_MODE(Header.compute_pgm_resource_registers) &&
- isGFX12Plus())
- return TokError("enable_ieee_mode=1 is not allowed on GFX12+");
- }
-
if (ID == "enable_wavefront_size32") {
- if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
+ if (C.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
if (!isGFX10Plus())
return TokError("enable_wavefront_size32=1 is only allowed on GFX10+");
if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
@@ -5903,41 +5891,23 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
}
if (ID == "wavefront_size") {
- if (Header.wavefront_size == 5) {
+ if (C.wavefront_size == 5) {
if (!isGFX10Plus())
return TokError("wavefront_size=5 is only allowed on GFX10+");
if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32])
return TokError("wavefront_size=5 requires +WavefrontSize32");
- } else if (Header.wavefront_size == 6) {
+ } else if (C.wavefront_size == 6) {
if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64])
return TokError("wavefront_size=6 requires +WavefrontSize64");
}
}
- if (ID == "enable_wgp_mode") {
- if (G_00B848_WGP_MODE(Header.compute_pgm_resource_registers) &&
- !isGFX10Plus())
- return TokError("enable_wgp_mode=1 is only allowed on GFX10+");
- }
-
- if (ID == "enable_mem_ordered") {
- if (G_00B848_MEM_ORDERED(Header.compute_pgm_resource_registers) &&
- !isGFX10Plus())
- return TokError("enable_mem_ordered=1 is only allowed on GFX10+");
- }
-
- if (ID == "enable_fwd_progress") {
- if (G_00B848_FWD_PROGRESS(Header.compute_pgm_resource_registers) &&
- !isGFX10Plus())
- return TokError("enable_fwd_progress=1 is only allowed on GFX10+");
- }
-
return false;
}
bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
- amd_kernel_code_t Header;
- AMDGPU::initDefaultAMDKernelCodeT(Header, &getSTI());
+ AMDGPUMCKernelCodeT KernelCode;
+ KernelCode.initDefault(&getSTI(), getContext());
while (true) {
// Lex EndOfStatement. This is in a while loop, because lexing a comment
@@ -5951,11 +5921,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
if (ID == ".end_amd_kernel_code_t")
break;
- if (ParseAMDKernelCodeTValue(ID, Header))
+ if (ParseAMDKernelCodeTValue(ID, KernelCode))
return true;
}
- getTargetStreamer().EmitAMDKernelCodeT(Header);
+ KernelCode.validate(&getSTI(), getContext());
+ getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b7548671f2c5..db5b467f2238 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1312,6 +1312,9 @@ public:
// \returns true if the target has IEEE fminimum/fmaximum instructions
bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
+ // \returns true if the target has IEEE fminimum3/fmaximum3 instructions
+ bool hasIEEEMinMax3() const { return hasIEEEMinMax(); }
+
// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
bool hasRrWGMode() const { return getGeneration() >= GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 02fe7be06280..00e64e3419ba 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -13,7 +13,6 @@
#include "AMDGPUTargetStreamer.h"
#include "AMDGPUMCKernelDescriptor.h"
#include "AMDGPUPTNote.h"
-#include "AMDKernelCodeT.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
@@ -240,10 +239,9 @@ void AMDGPUTargetAsmStreamer::EmitDirectiveAMDHSACodeObjectVersion(
OS << "\t.amdhsa_code_object_version " << COV << '\n';
}
-void
-AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
+void AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(AMDGPUMCKernelCodeT &Header) {
OS << "\t.amd_kernel_code_t\n";
- dumpAmdKernelCode(&Header, OS, "\t\t");
+ Header.EmitKernelCodeT(OS, getContext());
OS << "\t.end_amd_kernel_code_t\n";
}
@@ -789,12 +787,10 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV6() {
void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {}
-void
-AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
-
+void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(AMDGPUMCKernelCodeT &Header) {
MCStreamer &OS = getStreamer();
OS.pushSection();
- OS.emitBytes(StringRef((const char*)&Header, sizeof(Header)));
+ Header.EmitKernelCodeT(OS, getContext());
OS.popSection();
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 706897a5dc1f..e5c90060cb5d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -13,8 +13,6 @@
#include "Utils/AMDGPUPALMetadata.h"
#include "llvm/MC/MCStreamer.h"
-struct amd_kernel_code_t;
-
namespace llvm {
class MCELFStreamer;
@@ -23,6 +21,7 @@ class formatted_raw_ostream;
namespace AMDGPU {
+struct AMDGPUMCKernelCodeT;
struct MCKernelDescriptor;
namespace HSAMD {
struct Metadata;
@@ -54,7 +53,7 @@ public:
CodeObjectVersion = COV;
}
- virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header){};
+ virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header) {};
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type){};
@@ -130,7 +129,7 @@ public:
void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV) override;
- void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+ void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header) override;
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
@@ -186,7 +185,7 @@ public:
void EmitDirectiveAMDGCNTarget() override;
- void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
+ void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header) override;
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 6d0e0b3f4de2..1e9bfc77ab92 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -1111,7 +1111,7 @@ enum Type { TRAP = -2, WORKGROUP = -1 };
#define C_00B84C_LDS_SIZE 0xFF007FFF
#define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24)
#define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F)
-#define C_00B84C_EXCP_EN
+#define C_00B84C_EXCP_EN 0x80FFFFFF
#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC
#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 42e1c1ce764c..1d2a5fff2356 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -957,6 +957,11 @@ const GCNSubtarget *SITargetLowering::getSubtarget() const {
return Subtarget;
}
+ArrayRef<MCPhysReg> SITargetLowering::getRoundingControlRegisters() const {
+ static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
+ return RCRegs;
+}
+
//===----------------------------------------------------------------------===//
// TargetLowering queries
//===----------------------------------------------------------------------===//
@@ -7588,8 +7593,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
? (ReqRetNumElts + 1) / 2
: ReqRetNumElts;
- int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
- DMaskPop : (DMaskPop + 1) / 2;
+ int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
MVT DataDwordVT = NumDataDwords == 1 ?
MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
@@ -13195,6 +13199,33 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return SDValue();
}
+/// \return true if the subtarget supports minimum3 and maximum3 with the given
+/// base min/max opcode \p Opc for type \p VT.
+static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
+ EVT VT) {
+ switch (Opc) {
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXNUM_IEEE:
+ case AMDGPUISD::FMIN_LEGACY:
+ case AMDGPUISD::FMAX_LEGACY:
+ return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
+ case ISD::SMAX:
+ case ISD::SMIN:
+ case ISD::UMAX:
+ case ISD::UMIN:
+ return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
+ default:
+ return false;
+ }
+
+ llvm_unreachable("not a min/max opcode");
+}
+
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -13207,10 +13238,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
// Only do this if the inner op has one use since this will just increases
// register pressure for no benefit.
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
- !VT.isVector() &&
- (VT == MVT::i32 || VT == MVT::f32 ||
- ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
+ if (supportsMin3Max3(*Subtarget, Opc, VT)) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 08aa2a599163..fed73f48840f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -287,6 +287,8 @@ public:
const GCNSubtarget *getSubtarget() const;
+ ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
+
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT,
EVT SrcVT) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 08351c49b223..bb5f166e4792 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2031,50 +2031,57 @@ MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
MachineInstr &MI,
const DebugLoc &DL) const {
MachineFunction *MF = MBB.getParent();
- MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
- MachineBasicBlock *HaltLoop = MF->CreateMachineBasicBlock();
- MF->push_back(HaltLoop);
-
constexpr unsigned DoorbellIDMask = 0x3ff;
constexpr unsigned ECQueueWaveAbort = 0x400;
+ MachineBasicBlock *TrapBB = &MBB;
+ MachineBasicBlock *ContBB = &MBB;
+ MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
+
+ if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
+ ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
+ TrapBB = MF->CreateMachineBasicBlock();
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
+ MF->push_back(TrapBB);
+ MBB.addSuccessor(TrapBB);
+ }
+
// Start with a `s_trap 2`, if we're in PRIV=1 and we need the workaround this
// will be a nop.
- BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
+ DoorbellReg)
.addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
.addUse(AMDGPU::M0);
Register DoorbellRegMasked =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
.addUse(DoorbellReg)
.addImm(DoorbellIDMask);
Register SetWaveAbortBit =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
.addUse(DoorbellRegMasked)
.addImm(ECQueueWaveAbort);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addUse(SetWaveAbortBit);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
.addImm(AMDGPU::SendMsg::ID_INTERRUPT);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
.addUse(AMDGPU::TTMP2);
- BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);
-
- BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
- BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
- .addMBB(HaltLoop);
+ BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
+ TrapBB->addSuccessor(HaltLoopBB);
- if (SplitBB != &MBB)
- MBB.removeSuccessor(SplitBB);
- MBB.addSuccessor(HaltLoop);
- HaltLoop->addSuccessor(HaltLoop);
+ BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
+ BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
+ .addMBB(HaltLoopBB);
+ MF->push_back(HaltLoopBB);
+ HaltLoopBB->addSuccessor(HaltLoopBB);
- return SplitBB;
+ return ContBB;
}
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2beaf903542b..4b34fb27632a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUAsmUtils.h"
#include "AMDKernelCodeT.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/Attributes.h"
@@ -1218,39 +1219,37 @@ unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI,
}
} // end namespace IsaInfo
-void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
+void initDefaultAMDKernelCodeT(AMDGPUMCKernelCodeT &KernelCode,
const MCSubtargetInfo *STI) {
IsaVersion Version = getIsaVersion(STI->getCPU());
-
- memset(&Header, 0, sizeof(Header));
-
- Header.amd_kernel_code_version_major = 1;
- Header.amd_kernel_code_version_minor = 2;
- Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
- Header.amd_machine_version_major = Version.Major;
- Header.amd_machine_version_minor = Version.Minor;
- Header.amd_machine_version_stepping = Version.Stepping;
- Header.kernel_code_entry_byte_offset = sizeof(Header);
- Header.wavefront_size = 6;
+ KernelCode.amd_kernel_code_version_major = 1;
+ KernelCode.amd_kernel_code_version_minor = 2;
+ KernelCode.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
+ KernelCode.amd_machine_version_major = Version.Major;
+ KernelCode.amd_machine_version_minor = Version.Minor;
+ KernelCode.amd_machine_version_stepping = Version.Stepping;
+ KernelCode.kernel_code_entry_byte_offset = sizeof(amd_kernel_code_t);
+ if (STI->getFeatureBits().test(FeatureWavefrontSize32)) {
+ KernelCode.wavefront_size = 5;
+ KernelCode.code_properties |= AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
+ } else {
+ KernelCode.wavefront_size = 6;
+ }
// If the code object does not support indirect functions, then the value must
// be 0xffffffff.
- Header.call_convention = -1;
+ KernelCode.call_convention = -1;
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
- Header.kernarg_segment_alignment = 4;
- Header.group_segment_alignment = 4;
- Header.private_segment_alignment = 4;
+ KernelCode.kernarg_segment_alignment = 4;
+ KernelCode.group_segment_alignment = 4;
+ KernelCode.private_segment_alignment = 4;
if (Version.Major >= 10) {
- if (STI->getFeatureBits().test(FeatureWavefrontSize32)) {
- Header.wavefront_size = 5;
- Header.code_properties |= AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
- }
- Header.compute_pgm_resource_registers |=
- S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
- S_00B848_MEM_ORDERED(1);
+ KernelCode.compute_pgm_resource_registers |=
+ S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
+ S_00B848_MEM_ORDERED(1);
}
}
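The wavefront_size and *_segment_alignment fields written above are log2-encoded, as the in-line comments note. A minimal sketch of the decoding, using only the values set in this hunk (not an LLVM API):

#include <cstdint>
// The fields store exponents: wavefront_size 5 -> 32 lanes (wave32),
// 6 -> 64 lanes (wave64); the minimum alignment value 4 -> 2^4 = 16 bytes.
constexpr uint32_t decodeLog2Field(uint8_t Log2) { return 1u << Log2; }
static_assert(decodeLog2Field(5) == 32 && decodeLog2Field(6) == 64, "wavefront sizes");
static_assert(decodeLog2Field(4) == 16, "minimum segment alignment");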
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fc4147df76e3..3cfc42a7d24d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -37,6 +37,7 @@ class raw_ostream;
namespace AMDGPU {
+struct AMDGPUMCKernelCodeT;
struct IsaVersion;
/// Generic target versions emitted by this version of LLVM.
@@ -860,7 +861,7 @@ unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc);
LLVM_READONLY
unsigned mapWMMA3AddrTo2AddrOpcode(unsigned Opc);
-void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
+void initDefaultAMDKernelCodeT(AMDGPUMCKernelCodeT &Header,
const MCSubtargetInfo *STI);
bool isGroupSegment(const GlobalValue *GV);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index 95ad3f35d18f..75cb6cffbd51 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -12,34 +12,51 @@
//
//===----------------------------------------------------------------------===//
-#define QNAME(name) amd_kernel_code_t::name
+#define QNAME(name) AMDGPUMCKernelCodeT::name
#define FLD_T(name) decltype(QNAME(name)), &QNAME(name)
-#define FIELD2(sname, aname, name) \
- RECORD(sname, aname, printField<FLD_T(name)>, parseField<FLD_T(name)>)
+#ifndef PRINTFIELD
+#define PRINTFIELD(sname, aname, name) printField<FLD_T(name)>
+#endif
-#define FIELD(name) FIELD2(name, name, name)
+#ifndef FIELD2
+#define FIELD2(sname, aname, name) \
+ RECORD(sname, aname, PRINTFIELD(sname, aname, name), parseField<FLD_T(name)>)
+#endif
+#ifndef FIELD
+#define FIELD(name) FIELD2(name, name, name)
+#endif
+#ifndef PRINTCODEPROP
#define PRINTCODEPROP(name) \
printBitField<FLD_T(code_properties),\
AMD_CODE_PROPERTY_##name##_SHIFT,\
AMD_CODE_PROPERTY_##name##_WIDTH>
+#endif
+#ifndef PARSECODEPROP
#define PARSECODEPROP(name) \
parseBitField<FLD_T(code_properties),\
AMD_CODE_PROPERTY_##name##_SHIFT,\
AMD_CODE_PROPERTY_##name##_WIDTH>
+#endif
+#ifndef CODEPROP
#define CODEPROP(name, shift) \
RECORD(name, name, PRINTCODEPROP(shift), PARSECODEPROP(shift))
+#endif
// have to define these lambdas because of Set/GetMacro
+#ifndef PRINTCOMP
#define PRINTCOMP(GetMacro, Shift) \
[](StringRef Name, const amd_kernel_code_t &C, raw_ostream &OS) { \
printName(OS, Name) << \
(int)GetMacro(C.compute_pgm_resource_registers >> Shift); \
}
+#endif
+
+#ifndef PARSECOMP
#define PARSECOMP(SetMacro, Shift) \
[](amd_kernel_code_t &C, MCAsmParser &MCParser, raw_ostream &Err) { \
int64_t Value = 0; \
@@ -49,15 +66,22 @@
C.compute_pgm_resource_registers |= SetMacro(Value) << Shift; \
return true; \
}
+#endif
+#ifndef COMPPGM
#define COMPPGM(name, aname, GetMacro, SetMacro, Shift) \
RECORD(name, aname, PRINTCOMP(GetMacro, Shift), PARSECOMP(SetMacro, Shift))
+#endif
+#ifndef COMPPGM1
#define COMPPGM1(name, aname, AccMacro) \
COMPPGM(name, aname, G_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+#endif
+#ifndef COMPPGM2
#define COMPPGM2(name, aname, AccMacro) \
COMPPGM(name, aname, G_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+#endif
///////////////////////////////////////////////////////////////////////////////
// Begin of the table
@@ -143,13 +167,14 @@ FIELD(runtime_loader_kernel_symbol)
#undef QNAME
#undef FLD_T
+#undef PRINTFIELD
#undef FIELD2
#undef FIELD
#undef PRINTCODEPROP
#undef PARSECODEPROP
#undef CODEPROP
#undef PRINTCOMP
-#undef PAPSECOMP
+#undef PARSECOMP
#undef COMPPGM
#undef COMPPGM1
#undef COMPPGM2
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
index 6bbc8c315718..eaee1a2a9739 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp
@@ -6,44 +6,205 @@
//
//===----------------------------------------------------------------------===//
//
-/// \file - utility functions to parse/print amd_kernel_code_t structure
+/// \file - utility functions to parse/print AMDGPUMCKernelCodeT structure
//
//===----------------------------------------------------------------------===//
#include "AMDKernelCodeTUtils.h"
#include "AMDKernelCodeT.h"
#include "SIDefines.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringMap.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+using namespace llvm::AMDGPU;
-static ArrayRef<StringRef> get_amd_kernel_code_t_FldNames() {
- static StringRef const Table[] = {
- "", // not found placeholder
+// Generates the following for AMDGPUMCKernelCodeT struct members:
+// - HasMemberXXXXX class
+// A check to see if AMDGPUMCKernelCodeT has a specific member so it can
+// determine which of the original amd_kernel_code_t members are duplicated
+// (if the names don't match, the table-driven strategy won't work).
+// - IsMCExprXXXXX class
+// Check whether an AMDGPUMCKernelCodeT struct member is MCExpr-ified or not.
+// - GetMemberXXXXX class
+// A retrieval helper for said member (of type const MCExpr *&). Returns a
+// `Phony` const MCExpr *, initialized to nullptr, when the member is not
+// MCExpr-backed, so that the reference return type is preserved.
+#define GEN_HAS_MEMBER(member) \
+ class HasMember##member { \
+ private: \
+ struct KnownWithMember { \
+ int member; \
+ }; \
+ class AmbiguousDerived : public AMDGPUMCKernelCodeT, \
+ public KnownWithMember {}; \
+ template <typename U> \
+ static constexpr std::false_type Test(decltype(U::member) *); \
+ template <typename U> static constexpr std::true_type Test(...); \
+ \
+ public: \
+ static constexpr bool RESULT = \
+ std::is_same_v<decltype(Test<AmbiguousDerived>(nullptr)), \
+ std::true_type>; \
+ }; \
+ class IsMCExpr##member { \
+ template <typename U, \
+ typename std::enable_if_t< \
+ HasMember##member::RESULT && \
+ std::is_same_v<decltype(U::member), const MCExpr *>, \
+ U> * = nullptr> \
+ static constexpr std::true_type HasMCExprType(decltype(U::member) *); \
+ template <typename U> static constexpr std::false_type HasMCExprType(...); \
+ \
+ public: \
+ static constexpr bool RESULT = \
+ std::is_same_v<decltype(HasMCExprType<AMDGPUMCKernelCodeT>(nullptr)), \
+ std::true_type>; \
+ }; \
+ class GetMember##member { \
+ public: \
+ static const MCExpr *Phony; \
+ template <typename U, typename std::enable_if_t<IsMCExpr##member::RESULT, \
+ U> * = nullptr> \
+ static const MCExpr *&Get(U &C) { \
+ assert(IsMCExpr##member::RESULT && \
+ "Trying to retrieve member that does not exist."); \
+ return C.member; \
+ } \
+ template <typename U, typename std::enable_if_t<!IsMCExpr##member::RESULT, \
+ U> * = nullptr> \
+ static const MCExpr *&Get(U &C) { \
+ return Phony; \
+ } \
+ }; \
+ const MCExpr *GetMember##member::Phony = nullptr;
+
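A standalone sketch of the member-detection idiom the GEN_HAS_MEMBER macro relies on, with illustrative names (Probe stands in for AMDGPUMCKernelCodeT); this is a simplified example, not the LLVM code itself:

#include <type_traits>
// If Probe also declares `foo`, the lookup through AmbiguousDerived becomes
// ambiguous, the decltype overload is removed by SFINAE, and the variadic
// fallback (std::true_type) is chosen, so the constant reports the member exists.
struct Probe { int foo; };            // stands in for AMDGPUMCKernelCodeT
struct KnownWithFoo { int foo; };     // mirrors KnownWithMember
struct AmbiguousDerived : Probe, KnownWithFoo {};
template <typename U>
constexpr std::false_type hasFooTest(decltype(U::foo) *); // viable only if U::foo is unambiguous
template <typename U> constexpr std::true_type hasFooTest(...);
constexpr bool ProbeHasFoo =
    std::is_same_v<decltype(hasFooTest<AmbiguousDerived>(nullptr)), std::true_type>;
static_assert(ProbeHasFoo, "ambiguity means both bases declare foo");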
+// Cannot generate these class declarations using the table-driven approach
+// (see the table in AMDKernelCodeTInfo.h). Fortunately, if any member is
+// missing here or is later added to the table, an error will occur when
+// retrieving the table in getMCExprIndexTable.
+GEN_HAS_MEMBER(amd_code_version_major)
+GEN_HAS_MEMBER(amd_code_version_minor)
+GEN_HAS_MEMBER(amd_machine_kind)
+GEN_HAS_MEMBER(amd_machine_version_major)
+GEN_HAS_MEMBER(amd_machine_version_minor)
+GEN_HAS_MEMBER(amd_machine_version_stepping)
+
+GEN_HAS_MEMBER(kernel_code_entry_byte_offset)
+GEN_HAS_MEMBER(kernel_code_prefetch_byte_size)
+
+GEN_HAS_MEMBER(granulated_workitem_vgpr_count)
+GEN_HAS_MEMBER(granulated_wavefront_sgpr_count)
+GEN_HAS_MEMBER(priority)
+GEN_HAS_MEMBER(float_mode)
+GEN_HAS_MEMBER(priv)
+GEN_HAS_MEMBER(enable_dx10_clamp)
+GEN_HAS_MEMBER(debug_mode)
+GEN_HAS_MEMBER(enable_ieee_mode)
+GEN_HAS_MEMBER(enable_wgp_mode)
+GEN_HAS_MEMBER(enable_mem_ordered)
+GEN_HAS_MEMBER(enable_fwd_progress)
+
+GEN_HAS_MEMBER(enable_sgpr_private_segment_wave_byte_offset)
+GEN_HAS_MEMBER(user_sgpr_count)
+GEN_HAS_MEMBER(enable_trap_handler)
+GEN_HAS_MEMBER(enable_sgpr_workgroup_id_x)
+GEN_HAS_MEMBER(enable_sgpr_workgroup_id_y)
+GEN_HAS_MEMBER(enable_sgpr_workgroup_id_z)
+GEN_HAS_MEMBER(enable_sgpr_workgroup_info)
+GEN_HAS_MEMBER(enable_vgpr_workitem_id)
+GEN_HAS_MEMBER(enable_exception_msb)
+GEN_HAS_MEMBER(granulated_lds_size)
+GEN_HAS_MEMBER(enable_exception)
+
+GEN_HAS_MEMBER(enable_sgpr_private_segment_buffer)
+GEN_HAS_MEMBER(enable_sgpr_dispatch_ptr)
+GEN_HAS_MEMBER(enable_sgpr_queue_ptr)
+GEN_HAS_MEMBER(enable_sgpr_kernarg_segment_ptr)
+GEN_HAS_MEMBER(enable_sgpr_dispatch_id)
+GEN_HAS_MEMBER(enable_sgpr_flat_scratch_init)
+GEN_HAS_MEMBER(enable_sgpr_private_segment_size)
+GEN_HAS_MEMBER(enable_sgpr_grid_workgroup_count_x)
+GEN_HAS_MEMBER(enable_sgpr_grid_workgroup_count_y)
+GEN_HAS_MEMBER(enable_sgpr_grid_workgroup_count_z)
+GEN_HAS_MEMBER(enable_wavefront_size32)
+GEN_HAS_MEMBER(enable_ordered_append_gds)
+GEN_HAS_MEMBER(private_element_size)
+GEN_HAS_MEMBER(is_ptr64)
+GEN_HAS_MEMBER(is_dynamic_callstack)
+GEN_HAS_MEMBER(is_debug_enabled)
+GEN_HAS_MEMBER(is_xnack_enabled)
+
+GEN_HAS_MEMBER(workitem_private_segment_byte_size)
+GEN_HAS_MEMBER(workgroup_group_segment_byte_size)
+GEN_HAS_MEMBER(gds_segment_byte_size)
+GEN_HAS_MEMBER(kernarg_segment_byte_size)
+GEN_HAS_MEMBER(workgroup_fbarrier_count)
+GEN_HAS_MEMBER(wavefront_sgpr_count)
+GEN_HAS_MEMBER(workitem_vgpr_count)
+GEN_HAS_MEMBER(reserved_vgpr_first)
+GEN_HAS_MEMBER(reserved_vgpr_count)
+GEN_HAS_MEMBER(reserved_sgpr_first)
+GEN_HAS_MEMBER(reserved_sgpr_count)
+GEN_HAS_MEMBER(debug_wavefront_private_segment_offset_sgpr)
+GEN_HAS_MEMBER(debug_private_segment_buffer_sgpr)
+GEN_HAS_MEMBER(kernarg_segment_alignment)
+GEN_HAS_MEMBER(group_segment_alignment)
+GEN_HAS_MEMBER(private_segment_alignment)
+GEN_HAS_MEMBER(wavefront_size)
+GEN_HAS_MEMBER(call_convention)
+GEN_HAS_MEMBER(runtime_loader_kernel_symbol)
+
+static ArrayRef<StringLiteral> get_amd_kernel_code_t_FldNames() {
+ static constexpr StringLiteral const Table[] = {
+ "", // not found placeholder
#define RECORD(name, altName, print, parse) #name
-#include "AMDKernelCodeTInfo.h"
+#include "Utils/AMDKernelCodeTInfo.h"
#undef RECORD
};
return ArrayRef(Table);
}
-static ArrayRef<StringRef> get_amd_kernel_code_t_FldAltNames() {
- static StringRef const Table[] = {
- "", // not found placeholder
+static ArrayRef<StringLiteral> get_amd_kernel_code_t_FldAltNames() {
+ static constexpr StringLiteral const Table[] = {
+ "", // not found placeholder
#define RECORD(name, altName, print, parse) #altName
-#include "AMDKernelCodeTInfo.h"
+#include "Utils/AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return ArrayRef(Table);
+}
+
+static ArrayRef<bool> hasMCExprVersionTable() {
+ static bool const Table[] = {
+#define RECORD(name, altName, print, parse) (IsMCExpr##name::RESULT)
+#include "Utils/AMDKernelCodeTInfo.h"
#undef RECORD
};
return ArrayRef(Table);
}
-static StringMap<int> createIndexMap(const ArrayRef<StringRef> &names,
- const ArrayRef<StringRef> &altNames) {
+using RetrieveFx = const MCExpr *&(*)(AMDGPUMCKernelCodeT &);
+
+static ArrayRef<RetrieveFx> getMCExprIndexTable() {
+ static const RetrieveFx Table[] = {
+#define RECORD(name, altName, print, parse) GetMember##name::Get
+#include "Utils/AMDKernelCodeTInfo.h"
+#undef RECORD
+ };
+ return ArrayRef(Table);
+}
+
+static StringMap<int> createIndexMap(ArrayRef<StringLiteral> names,
+ ArrayRef<StringLiteral> altNames) {
StringMap<int> map;
assert(names.size() == altNames.size());
for (unsigned i = 0; i < names.size(); ++i) {
@@ -59,62 +220,111 @@ static int get_amd_kernel_code_t_FieldIndex(StringRef name) {
return map.lookup(name) - 1; // returns -1 if not found
}
-static StringRef get_amd_kernel_code_t_FieldName(int index) {
- return get_amd_kernel_code_t_FldNames()[index + 1];
-}
+static constexpr std::pair<unsigned, unsigned> getShiftMask(unsigned Value) {
+ unsigned Shift = 0;
+ unsigned Mask = 0;
-// Field printing
+ Mask = ~Value;
+ for (; !(Mask & 1); Shift++, Mask >>= 1) {
+ }
-static raw_ostream &printName(raw_ostream &OS, StringRef Name) {
- return OS << Name << " = ";
+ return std::make_pair(Shift, Mask);
}
-template <typename T, T amd_kernel_code_t::*ptr>
-static void printField(StringRef Name, const amd_kernel_code_t &C,
- raw_ostream &OS) {
- printName(OS, Name) << (int)(C.*ptr);
+static const MCExpr *MaskShiftSet(const MCExpr *Val, uint32_t Mask,
+ uint32_t Shift, MCContext &Ctx) {
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
+ }
+ return Val;
}
-template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
-static void printBitField(StringRef Name, const amd_kernel_code_t &c,
- raw_ostream &OS) {
+static const MCExpr *MaskShiftGet(const MCExpr *Val, uint32_t Mask,
+ uint32_t Shift, MCContext &Ctx) {
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createLShr(Val, ShiftExpr, Ctx);
+ }
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ return Val;
+}
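In plain integer terms the helpers above amount to MaskShiftGet(V, Mask, Shift) = (V >> Shift) & Mask and MaskShiftSet(V, Mask, Shift) = (V & Mask) << Shift, with getShiftMask recovering (Shift, Mask) from an in-place clear constant such as the C_00B848_* macros. A small sketch under that assumption:

#include <cstdint>
#include <utility>
// Sketch only: the real helpers build MCExprs so operands can stay symbolic
// until emission; this is the constant-folded equivalent of getShiftMask.
constexpr std::pair<unsigned, uint32_t> shiftMaskOf(uint32_t Complement) {
  uint32_t InPlaceMask = ~Complement; // e.g. ~C_00B848_* gives the field mask in place
  unsigned Shift = 0;
  while (!(InPlaceMask & 1)) {
    ++Shift;
    InPlaceMask >>= 1;
  }
  return {Shift, InPlaceMask}; // mask is shifted down to bit 0, like getShiftMask
}
// A field occupying bits [6..9] has in-place mask 0x3C0 and complement 0xFFFFFC3F.
static_assert(shiftMaskOf(0xFFFFFC3F).first == 6, "field starts at bit 6");
static_assert(shiftMaskOf(0xFFFFFC3F).second == 0xF, "field is 4 bits wide");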
+
+class PrintField {
+public:
+ template <typename T, T AMDGPUMCKernelCodeT::*ptr,
+ typename std::enable_if_t<!std::is_integral_v<T>, T> * = nullptr>
+ static void printField(StringRef Name, const AMDGPUMCKernelCodeT &C,
+ raw_ostream &OS, MCContext &Ctx) {
+ OS << Name << " = ";
+ const MCExpr *Value = C.*ptr;
+ int64_t Val;
+ if (Value->evaluateAsAbsolute(Val))
+ OS << Val;
+ else
+ Value->print(OS, Ctx.getAsmInfo());
+ }
+
+ template <typename T, T AMDGPUMCKernelCodeT::*ptr,
+ typename std::enable_if_t<std::is_integral_v<T>, T> * = nullptr>
+ static void printField(StringRef Name, const AMDGPUMCKernelCodeT &C,
+ raw_ostream &OS, MCContext &) {
+ OS << Name << " = " << (int)(C.*ptr);
+ }
+};
+
+template <typename T, T AMDGPUMCKernelCodeT::*ptr, int shift, int width = 1>
+static void printBitField(StringRef Name, const AMDGPUMCKernelCodeT &C,
+ raw_ostream &OS, MCContext &) {
const auto Mask = (static_cast<T>(1) << width) - 1;
- printName(OS, Name) << (int)((c.*ptr >> shift) & Mask);
+ OS << Name << " = " << (int)((C.*ptr >> shift) & Mask);
}
-using PrintFx = void(*)(StringRef, const amd_kernel_code_t &, raw_ostream &);
+using PrintFx = void (*)(StringRef, const AMDGPUMCKernelCodeT &, raw_ostream &,
+ MCContext &);
static ArrayRef<PrintFx> getPrinterTable() {
static const PrintFx Table[] = {
+#define COMPPGM1(name, aname, AccMacro) \
+ COMPPGM(name, aname, C_00B848_##AccMacro, S_00B848_##AccMacro, 0)
+#define COMPPGM2(name, aname, AccMacro) \
+ COMPPGM(name, aname, C_00B84C_##AccMacro, S_00B84C_##AccMacro, 32)
+#define PRINTFIELD(sname, aname, name) PrintField::printField<FLD_T(name)>
+#define PRINTCOMP(Complement, PGMType) \
+ [](StringRef Name, const AMDGPUMCKernelCodeT &C, raw_ostream &OS, \
+ MCContext &Ctx) { \
+ OS << Name << " = "; \
+ auto [Shift, Mask] = getShiftMask(Complement); \
+ const MCExpr *Value; \
+ if (PGMType == 0) { \
+ Value = \
+ MaskShiftGet(C.compute_pgm_resource1_registers, Mask, Shift, Ctx); \
+ } else { \
+ Value = \
+ MaskShiftGet(C.compute_pgm_resource2_registers, Mask, Shift, Ctx); \
+ } \
+ int64_t Val; \
+ if (Value->evaluateAsAbsolute(Val)) \
+ OS << Val; \
+ else \
+ Value->print(OS, Ctx.getAsmInfo()); \
+ }
#define RECORD(name, altName, print, parse) print
-#include "AMDKernelCodeTInfo.h"
+#include "Utils/AMDKernelCodeTInfo.h"
#undef RECORD
};
return ArrayRef(Table);
}
-void llvm::printAmdKernelCodeField(const amd_kernel_code_t &C,
- int FldIndex,
- raw_ostream &OS) {
- auto Printer = getPrinterTable()[FldIndex];
- if (Printer)
- Printer(get_amd_kernel_code_t_FieldName(FldIndex), C, OS);
-}
-
-void llvm::dumpAmdKernelCode(const amd_kernel_code_t *C,
- raw_ostream &OS,
- const char *tab) {
- const int Size = getPrinterTable().size();
- for (int i = 0; i < Size; ++i) {
- OS << tab;
- printAmdKernelCodeField(*C, i, OS);
- OS << '\n';
- }
-}
-
-// Field parsing
-
-static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostream& Err) {
+static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value,
+ raw_ostream &Err) {
if (MCParser.getLexer().isNot(AsmToken::Equal)) {
Err << "expected '='";
@@ -129,8 +339,8 @@ static bool expectAbsExpression(MCAsmParser &MCParser, int64_t &Value, raw_ostre
return true;
}
-template <typename T, T amd_kernel_code_t::*ptr>
-static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+template <typename T, T AMDGPUMCKernelCodeT::*ptr>
+static bool parseField(AMDGPUMCKernelCodeT &C, MCAsmParser &MCParser,
raw_ostream &Err) {
int64_t Value = 0;
if (!expectAbsExpression(MCParser, Value, Err))
@@ -139,39 +349,241 @@ static bool parseField(amd_kernel_code_t &C, MCAsmParser &MCParser,
return true;
}
-template <typename T, T amd_kernel_code_t::*ptr, int shift, int width = 1>
-static bool parseBitField(amd_kernel_code_t &C, MCAsmParser &MCParser,
+template <typename T, T AMDGPUMCKernelCodeT::*ptr, int shift, int width = 1>
+static bool parseBitField(AMDGPUMCKernelCodeT &C, MCAsmParser &MCParser,
raw_ostream &Err) {
int64_t Value = 0;
if (!expectAbsExpression(MCParser, Value, Err))
return false;
- const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift;
+ const uint64_t Mask = ((UINT64_C(1) << width) - 1) << shift;
C.*ptr &= (T)~Mask;
C.*ptr |= (T)((Value << shift) & Mask);
return true;
}
-using ParseFx = bool(*)(amd_kernel_code_t &, MCAsmParser &MCParser,
- raw_ostream &Err);
+static bool parseExpr(MCAsmParser &MCParser, const MCExpr *&Value,
+ raw_ostream &Err) {
+ if (MCParser.getLexer().isNot(AsmToken::Equal)) {
+ Err << "expected '='";
+ return false;
+ }
+ MCParser.getLexer().Lex();
+
+ if (MCParser.parseExpression(Value)) {
+ Err << "Could not parse expression";
+ return false;
+ }
+ return true;
+}
+
+using ParseFx = bool (*)(AMDGPUMCKernelCodeT &, MCAsmParser &, raw_ostream &);
static ArrayRef<ParseFx> getParserTable() {
static const ParseFx Table[] = {
+#define COMPPGM1(name, aname, AccMacro) \
+ COMPPGM(name, aname, G_00B848_##AccMacro, C_00B848_##AccMacro, 0)
+#define COMPPGM2(name, aname, AccMacro) \
+ COMPPGM(name, aname, G_00B84C_##AccMacro, C_00B84C_##AccMacro, 32)
+#define PARSECOMP(Complement, PGMType) \
+ [](AMDGPUMCKernelCodeT &C, MCAsmParser &MCParser, \
+ raw_ostream &Err) -> bool { \
+ MCContext &Ctx = MCParser.getContext(); \
+ const MCExpr *Value; \
+ if (!parseExpr(MCParser, Value, Err)) \
+ return false; \
+ auto [Shift, Mask] = getShiftMask(Complement); \
+ Value = MaskShiftSet(Value, Mask, Shift, Ctx); \
+ const MCExpr *Compl = MCConstantExpr::create(Complement, Ctx); \
+ if (PGMType == 0) { \
+ C.compute_pgm_resource1_registers = MCBinaryExpr::createAnd( \
+ C.compute_pgm_resource1_registers, Compl, Ctx); \
+ C.compute_pgm_resource1_registers = MCBinaryExpr::createOr( \
+ C.compute_pgm_resource1_registers, Value, Ctx); \
+ } else { \
+ C.compute_pgm_resource2_registers = MCBinaryExpr::createAnd( \
+ C.compute_pgm_resource2_registers, Compl, Ctx); \
+ C.compute_pgm_resource2_registers = MCBinaryExpr::createOr( \
+ C.compute_pgm_resource2_registers, Value, Ctx); \
+ } \
+ return true; \
+ }
#define RECORD(name, altName, print, parse) parse
-#include "AMDKernelCodeTInfo.h"
+#include "Utils/AMDKernelCodeTInfo.h"
#undef RECORD
};
return ArrayRef(Table);
}
-bool llvm::parseAmdKernelCodeField(StringRef ID,
- MCAsmParser &MCParser,
- amd_kernel_code_t &C,
- raw_ostream &Err) {
+static void printAmdKernelCodeField(const AMDGPUMCKernelCodeT &C, int FldIndex,
+ raw_ostream &OS, MCContext &Ctx) {
+ auto Printer = getPrinterTable()[FldIndex];
+ if (Printer)
+ Printer(get_amd_kernel_code_t_FldNames()[FldIndex + 1], C, OS, Ctx);
+}
+
+void AMDGPUMCKernelCodeT::initDefault(const MCSubtargetInfo *STI,
+ MCContext &Ctx, bool InitMCExpr) {
+ AMDGPUMCKernelCodeT();
+
+ AMDGPU::initDefaultAMDKernelCodeT(*this, STI);
+
+ if (InitMCExpr) {
+ const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+ compute_pgm_resource1_registers =
+ MCConstantExpr::create(Lo_32(compute_pgm_resource_registers), Ctx);
+ compute_pgm_resource2_registers =
+ MCConstantExpr::create(Hi_32(compute_pgm_resource_registers), Ctx);
+ is_dynamic_callstack = ZeroExpr;
+ wavefront_sgpr_count = ZeroExpr;
+ workitem_vgpr_count = ZeroExpr;
+ workitem_private_segment_byte_size = ZeroExpr;
+ }
+}
+
+void AMDGPUMCKernelCodeT::validate(const MCSubtargetInfo *STI, MCContext &Ctx) {
+ int64_t Value;
+ if (!compute_pgm_resource1_registers->evaluateAsAbsolute(Value))
+ return;
+
+ if (G_00B848_DX10_CLAMP(Value) && AMDGPU::isGFX12Plus(*STI)) {
+ Ctx.reportError({}, "enable_dx10_clamp=1 is not allowed on GFX12+");
+ return;
+ }
+
+ if (G_00B848_IEEE_MODE(Value) && AMDGPU::isGFX12Plus(*STI)) {
+ Ctx.reportError({}, "enable_ieee_mode=1 is not allowed on GFX12+");
+ return;
+ }
+
+ if (G_00B848_WGP_MODE(Value) && !AMDGPU::isGFX10Plus(*STI)) {
+ Ctx.reportError({}, "enable_wgp_mode=1 is only allowed on GFX10+");
+ return;
+ }
+
+ if (G_00B848_MEM_ORDERED(Value) && !AMDGPU::isGFX10Plus(*STI)) {
+ Ctx.reportError({}, "enable_mem_ordered=1 is only allowed on GFX10+");
+ return;
+ }
+
+ if (G_00B848_FWD_PROGRESS(Value) && !AMDGPU::isGFX10Plus(*STI)) {
+ Ctx.reportError({}, "enable_fwd_progress=1 is only allowed on GFX10+");
+ return;
+ }
+}
+
+const MCExpr *&AMDGPUMCKernelCodeT::getMCExprForIndex(int Index) {
+ static const auto IndexTable = getMCExprIndexTable();
+ return IndexTable[Index](*this);
+}
+
+bool AMDGPUMCKernelCodeT::ParseKernelCodeT(StringRef ID, MCAsmParser &MCParser,
+ raw_ostream &Err) {
const int Idx = get_amd_kernel_code_t_FieldIndex(ID);
if (Idx < 0) {
Err << "unexpected amd_kernel_code_t field name " << ID;
return false;
}
+
+ if (hasMCExprVersionTable()[Idx]) {
+ const MCExpr *Value;
+ if (!parseExpr(MCParser, Value, Err))
+ return false;
+ getMCExprForIndex(Idx) = Value;
+ return true;
+ }
auto Parser = getParserTable()[Idx];
- return Parser ? Parser(C, MCParser, Err) : false;
+ return Parser ? Parser(*this, MCParser, Err) : false;
+}
+
+void AMDGPUMCKernelCodeT::EmitKernelCodeT(raw_ostream &OS, MCContext &Ctx) {
+ const int Size = hasMCExprVersionTable().size();
+ for (int i = 0; i < Size; ++i) {
+ OS << "\t\t";
+ if (hasMCExprVersionTable()[i]) {
+ OS << get_amd_kernel_code_t_FldNames()[i + 1] << " = ";
+ int64_t Val;
+ const MCExpr *Value = getMCExprForIndex(i);
+ if (Value->evaluateAsAbsolute(Val))
+ OS << Val;
+ else
+ Value->print(OS, Ctx.getAsmInfo());
+ } else {
+ printAmdKernelCodeField(*this, i, OS, Ctx);
+ }
+ OS << '\n';
+ }
+}
+
+void AMDGPUMCKernelCodeT::EmitKernelCodeT(MCStreamer &OS, MCContext &Ctx) {
+ OS.emitIntValue(amd_kernel_code_version_major, /*Size=*/4);
+ OS.emitIntValue(amd_kernel_code_version_minor, /*Size=*/4);
+ OS.emitIntValue(amd_machine_kind, /*Size=*/2);
+ OS.emitIntValue(amd_machine_version_major, /*Size=*/2);
+ OS.emitIntValue(amd_machine_version_minor, /*Size=*/2);
+ OS.emitIntValue(amd_machine_version_stepping, /*Size=*/2);
+ OS.emitIntValue(kernel_code_entry_byte_offset, /*Size=*/8);
+ OS.emitIntValue(kernel_code_prefetch_byte_offset, /*Size=*/8);
+ OS.emitIntValue(kernel_code_prefetch_byte_size, /*Size=*/8);
+ OS.emitIntValue(reserved0, /*Size=*/8);
+
+ if (compute_pgm_resource1_registers != nullptr)
+ OS.emitValue(compute_pgm_resource1_registers, /*Size=*/4);
+ else
+ OS.emitIntValue(Lo_32(compute_pgm_resource_registers),
+ /*Size=*/4);
+
+ if (compute_pgm_resource2_registers != nullptr)
+ OS.emitValue(compute_pgm_resource2_registers, /*Size=*/4);
+ else
+ OS.emitIntValue(Hi_32(compute_pgm_resource_registers),
+ /*Size=*/4);
+
+ if (is_dynamic_callstack != nullptr) {
+ const MCExpr *CodeProps = MCConstantExpr::create(code_properties, Ctx);
+ CodeProps = MCBinaryExpr::createOr(
+ CodeProps,
+ MaskShiftSet(is_dynamic_callstack,
+ (1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1,
+ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, Ctx),
+ Ctx);
+ OS.emitValue(CodeProps, /*Size=*/4);
+ } else
+ OS.emitIntValue(code_properties, /*Size=*/4);
+
+ if (workitem_private_segment_byte_size != nullptr)
+ OS.emitValue(workitem_private_segment_byte_size, /*Size=*/4);
+ else
+ OS.emitIntValue(0, /*Size=*/4);
+
+ OS.emitIntValue(workgroup_group_segment_byte_size, /*Size=*/4);
+ OS.emitIntValue(gds_segment_byte_size, /*Size=*/4);
+ OS.emitIntValue(kernarg_segment_byte_size, /*Size=*/8);
+ OS.emitIntValue(workgroup_fbarrier_count, /*Size=*/4);
+
+ if (wavefront_sgpr_count != nullptr)
+ OS.emitValue(wavefront_sgpr_count, /*Size=*/2);
+ else
+ OS.emitIntValue(0, /*Size=*/2);
+
+ if (workitem_vgpr_count != nullptr)
+ OS.emitValue(workitem_vgpr_count, /*Size=*/2);
+ else
+ OS.emitIntValue(0, /*Size=*/2);
+
+ OS.emitIntValue(reserved_vgpr_first, /*Size=*/2);
+ OS.emitIntValue(reserved_vgpr_count, /*Size=*/2);
+ OS.emitIntValue(reserved_sgpr_first, /*Size=*/2);
+ OS.emitIntValue(reserved_sgpr_count, /*Size=*/2);
+ OS.emitIntValue(debug_wavefront_private_segment_offset_sgpr,
+ /*Size=*/2);
+ OS.emitIntValue(debug_private_segment_buffer_sgpr, /*Size=*/2);
+ OS.emitIntValue(kernarg_segment_alignment, /*Size=*/1);
+ OS.emitIntValue(group_segment_alignment, /*Size=*/1);
+ OS.emitIntValue(private_segment_alignment, /*Size=*/1);
+ OS.emitIntValue(wavefront_size, /*Size=*/1);
+
+ OS.emitIntValue(call_convention, /*Size=*/4);
+ OS.emitBytes(StringRef((const char *)reserved3, /*Size=*/12));
+ OS.emitIntValue(runtime_loader_kernel_symbol, /*Size=*/8);
+ OS.emitBytes(StringRef((const char *)control_directives, /*Size=*/16 * 8));
}
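The explicit sizes streamed above add up to 256 bytes, i.e. one full amd_kernel_code_t record. A hedged sanity check, assuming the struct definition from AMDKernelCodeT.h:

#include "AMDKernelCodeT.h"
// 16 + 32 + 8 + 4 + 4 + 8 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 12 + 8 + 128 == 256,
// matching the emitIntValue/emitValue sizes in the function above.
static_assert(sizeof(amd_kernel_code_t) == 256,
              "the streamed fields must cover the whole amd_kernel_code_t");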
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
index 41d0e0d745e5..6aeb98f1ce14 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.h
@@ -7,29 +7,84 @@
//===----------------------------------------------------------------------===//
//
/// \file AMDKernelCodeTUtils.h
+/// MC layer struct for amd_kernel_code_t (AMDGPUMCKernelCodeT), which provides
+/// MCExpr functionality where required.
+///
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
-#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELCODET_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELCODET_H
-struct amd_kernel_code_t;
+#include "AMDKernelCodeT.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
namespace llvm {
-
class MCAsmParser;
+class MCContext;
+class MCExpr;
+class MCStreamer;
+class MCSubtargetInfo;
class raw_ostream;
-class StringRef;
+namespace AMDGPU {
+
+struct AMDGPUMCKernelCodeT {
+ AMDGPUMCKernelCodeT() = default;
+
+  // Names of most (if not all) members should match the ones used for the
+  // table-driven (array) generation in AMDKernelCodeTInfo.h.
+ uint32_t amd_kernel_code_version_major = 0;
+ uint32_t amd_kernel_code_version_minor = 0;
+ uint16_t amd_machine_kind = 0;
+ uint16_t amd_machine_version_major = 0;
+ uint16_t amd_machine_version_minor = 0;
+ uint16_t amd_machine_version_stepping = 0;
+ int64_t kernel_code_entry_byte_offset = 0;
+ int64_t kernel_code_prefetch_byte_offset = 0;
+ uint64_t kernel_code_prefetch_byte_size = 0;
+ uint64_t reserved0 = 0;
+ uint64_t compute_pgm_resource_registers = 0;
+ uint32_t code_properties = 0;
+ uint32_t workgroup_group_segment_byte_size = 0;
+ uint32_t gds_segment_byte_size = 0;
+ uint64_t kernarg_segment_byte_size = 0;
+ uint32_t workgroup_fbarrier_count = 0;
+ uint16_t reserved_vgpr_first = 0;
+ uint16_t reserved_vgpr_count = 0;
+ uint16_t reserved_sgpr_first = 0;
+ uint16_t reserved_sgpr_count = 0;
+ uint16_t debug_wavefront_private_segment_offset_sgpr = 0;
+ uint16_t debug_private_segment_buffer_sgpr = 0;
+ uint8_t kernarg_segment_alignment = 0;
+ uint8_t group_segment_alignment = 0;
+ uint8_t private_segment_alignment = 0;
+ uint8_t wavefront_size = 0;
+ int32_t call_convention = 0;
+ uint8_t reserved3[12] = {0};
+ uint64_t runtime_loader_kernel_symbol = 0;
+ uint64_t control_directives[16] = {0};
+
+ const MCExpr *compute_pgm_resource1_registers = nullptr;
+ const MCExpr *compute_pgm_resource2_registers = nullptr;
+
+ const MCExpr *is_dynamic_callstack = nullptr;
+ const MCExpr *wavefront_sgpr_count = nullptr;
+ const MCExpr *workitem_vgpr_count = nullptr;
+ const MCExpr *workitem_private_segment_byte_size = nullptr;
-void printAmdKernelCodeField(const amd_kernel_code_t &C, int FldIndex,
- raw_ostream &OS);
+ void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx,
+ bool InitMCExpr = true);
+ void validate(const MCSubtargetInfo *STI, MCContext &Ctx);
-void dumpAmdKernelCode(const amd_kernel_code_t *C, raw_ostream &OS,
- const char *tab);
+ const MCExpr *&getMCExprForIndex(int Index);
-bool parseAmdKernelCodeField(StringRef ID, MCAsmParser &Parser,
- amd_kernel_code_t &C, raw_ostream &Err);
+ bool ParseKernelCodeT(StringRef ID, MCAsmParser &MCParser, raw_ostream &Err);
+ void EmitKernelCodeT(raw_ostream &OS, MCContext &Ctx);
+ void EmitKernelCodeT(MCStreamer &OS, MCContext &Ctx);
+};
+} // end namespace AMDGPU
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDKERNELCODETUTILS_H
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCKERNELCODET_H
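A hedged usage sketch of the new interface; the real call sites are the AMDGPU assembly parser and target streamer, and STI, Ctx and OS here are assumed to come from the surrounding MC setup:

#include "Utils/AMDKernelCodeTUtils.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
static void emitDefaultKernelCode(const MCSubtargetInfo *STI, MCContext &Ctx,
                                  raw_ostream &OS) {
  AMDGPU::AMDGPUMCKernelCodeT KernelCode;
  KernelCode.initDefault(STI, Ctx);     // defaults plus MCExpr-ified rsrc words/counts
  KernelCode.validate(STI, Ctx);        // reports GFX-generation mismatches via Ctx
  KernelCode.EmitKernelCodeT(OS, Ctx);  // textual, field-by-field printout
}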
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 19d3b690b131..2f4ce8eaf1d6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMAMDGPUUtils
CodeGenTypes
Core
MC
+ MCParser
Support
TargetParser
diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index f609305bfee4..91ffbc4eb77d 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -1119,18 +1119,24 @@ def : MipsPat<(select i32:$cond, immz, i32:$f),
// llvm.fmin/fmax operations.
let AdditionalPredicates = [NotInMicroMips] in {
- def : MipsPat<(fmaxnum f32:$lhs, f32:$rhs),
+ def : MipsPat<(fmaxnum_ieee f32:$lhs, f32:$rhs),
(MAX_S f32:$lhs, f32:$rhs)>,
ISA_MIPS32R6;
- def : MipsPat<(fmaxnum f64:$lhs, f64:$rhs),
+ def : MipsPat<(fmaxnum_ieee f64:$lhs, f64:$rhs),
(MAX_D f64:$lhs, f64:$rhs)>,
ISA_MIPS32R6;
- def : MipsPat<(fminnum f32:$lhs, f32:$rhs),
+ def : MipsPat<(fminnum_ieee f32:$lhs, f32:$rhs),
(MIN_S f32:$lhs, f32:$rhs)>,
ISA_MIPS32R6;
- def : MipsPat<(fminnum f64:$lhs, f64:$rhs),
+ def : MipsPat<(fminnum_ieee f64:$lhs, f64:$rhs),
(MIN_D f64:$lhs, f64:$rhs)>,
ISA_MIPS32R6;
+ def : MipsPat<(f32 (fcanonicalize f32:$src)),
+ (MIN_S f32:$src, f32:$src)>,
+ ISA_MIPS32R6;
+ def : MipsPat<(f64 (fcanonicalize f64:$src)),
+ (MIN_D f64:$src, f64:$src)>,
+ ISA_MIPS32R6;
}
// Pseudo instructions
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 459164fa7a29..c2be8c80b7a8 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -360,11 +360,15 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
// Lower fmin and fmax operations for MIPS R6.
// Instructions are defined but never used.
- if (Subtarget.hasMips32r6() || Subtarget.hasMips64r6()) {
- setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ if (Subtarget.hasMips32r6()) {
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f32, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f32, Expand);
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::f64, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Expand);
}
if (Subtarget.isGP64bit()) {
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index ac48dc5af9d5..f4e84ade3b5a 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1157,12 +1157,12 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO);
- // If the symbol isn't toc-data then use the TOC on AIX.
// Map the global address operand to be a reference to the TOC entry we
// will synthesize later. 'TOCEntry' is a label used to reference the
// storage allocated in the TOC which contains the address of 'MOSymbol'.
- // If the toc-data attribute is used, the TOC entry contains the data
- // rather than the address of the MOSymbol.
+ // If the symbol does not have the toc-data attribute, then we create the
+ // TOC entry on AIX. If the toc-data attribute is used, the TOC entry
+ // contains the data rather than the address of the MOSymbol.
if (![](const MachineOperand &MO) {
if (!MO.isGlobal())
return false;
@@ -1170,7 +1170,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal());
if (!GV)
return false;
-
return GV->hasAttribute("toc-data");
}(MO)) {
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK);
@@ -1301,8 +1300,10 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
unsigned Op = MI->getOpcode();
- // Change the opcode to load address for tocdata
- TmpInst.setOpcode(Op == PPC::ADDItocL8 ? PPC::ADDI8 : PPC::LA);
+ // Change the opcode to load address for toc-data.
+ // ADDItocL is only used for 32-bit toc-data on AIX and will always use LA.
+ TmpInst.setOpcode(Op == PPC::ADDItocL8 ? (IsAIX ? PPC::LA8 : PPC::ADDI8)
+ : PPC::LA);
const MachineOperand &MO = MI->getOperand(2);
assert((Op == PPC::ADDItocL8)
@@ -1316,8 +1317,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
const MCExpr *Exp = MCSymbolRefExpr::create(
MOSymbol,
- Op == PPC::ADDItocL8 ? MCSymbolRefExpr::VK_PPC_TOC_LO
- : MCSymbolRefExpr::VK_PPC_L,
+ IsAIX ? MCSymbolRefExpr::VK_PPC_L : MCSymbolRefExpr::VK_PPC_TOC_LO,
OutContext);
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
@@ -2831,8 +2831,10 @@ void PPCAIXAsmPrinter::emitGlobalVariableHelper(const GlobalVariable *GV) {
// When -fdata-sections is enabled, every GlobalVariable will
// be put into its own csect; therefore, label is not necessary here.
- if (!TM.getDataSections() || GV->hasSection())
- OutStreamer->emitLabel(EmittedInitSym);
+ if (!TM.getDataSections() || GV->hasSection()) {
+ if (Csect->getMappingClass() != XCOFF::XMC_TD)
+ OutStreamer->emitLabel(EmittedInitSym);
+ }
// No alias to emit.
if (!GOAliasMap[GV].size()) {
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 6e31cdae8476..735050641adf 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -2074,16 +2074,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
if (GV->isThreadLocal())
return 0;
- // If the global has the toc-data attribute then fallback to DAG-ISEL.
- if (TM.getTargetTriple().isOSAIX())
- if (const GlobalVariable *Var = dyn_cast_or_null<GlobalVariable>(GV))
- if (Var->hasAttribute("toc-data"))
- return false;
-
PPCFuncInfo->setUsesTOCBasePtr();
+ bool IsAIXTocData = TM.getTargetTriple().isOSAIX() &&
+ isa<GlobalVariable>(GV) &&
+ cast<GlobalVariable>(GV)->hasAttribute("toc-data");
+
// For small code model, generate a simple TOC load.
if (CModel == CodeModel::Small)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::LDtoc),
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD,
+ IsAIXTocData ? TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc),
DestReg)
.addGlobalAddress(GV)
.addReg(PPC::X2);
@@ -2101,6 +2100,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
if (Subtarget->isGVIndirectSymbol(GV)) {
+ assert(!IsAIXTocData && "TOC data should always be direct.");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
} else {
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 68621558e3fa..26560dc5cdeb 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6143,23 +6143,22 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
" ELF/AIX or 32-bit AIX in the following.");
// Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode,
- // or 64-bit medium (ELF-only), or large (ELF and AIX) code model code that
- // does not conain TOC data symbols.
- // We generate two instructions as described below. The first source
- // operand is a symbol reference. If it must be referenced via the toc
- // according to Subtarget, we generate:
+ // 64-bit medium (ELF-only), or 64-bit large (ELF and AIX) code model code
+ // that does not contain TOC data symbols. We generate two instructions as
+ // described below. The first source operand is a symbol reference. If it
+ // must be referenced via the TOC according to Subtarget, we generate:
// [32-bit AIX]
// LWZtocL(@sym, ADDIStocHA(%r2, @sym))
// [64-bit ELF/AIX]
// LDtocL(@sym, ADDIStocHA8(%x2, @sym))
- // Otherwise we generate:
+ // Otherwise for medium code model ELF we generate:
// ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)
- // For large code model with TOC data symbols we generate:
+ // And finally for AIX with toc-data we generate:
// [32-bit AIX]
// ADDItocL(ADDIStocHA(%x2, @sym), @sym)
// [64-bit AIX]
- // Currently not supported.
+ // ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)
SDValue GA = N->getOperand(0);
SDValue TOCbase = N->getOperand(1);
@@ -6171,12 +6170,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// On AIX, if the symbol has the toc-data attribute it will be defined
// in the TOC entry, so we use an ADDItocL/ADDItocL8.
if (isAIXABI && hasTocDataAttr(GA)) {
- if (isPPC64)
- report_fatal_error(
- "64-bit large code model toc-data not yet supported");
-
- ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, VT,
- SDValue(Tmp, 0), GA));
+ ReplaceNode(
+ N, CurDAG->getMachineNode(isPPC64 ? PPC::ADDItocL8 : PPC::ADDItocL,
+ dl, VT, SDValue(Tmp, 0), GA));
return;
}
@@ -6191,6 +6187,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
+ assert(isPPC64 && "TOC_ENTRY already handled for 32-bit.");
// Build the address relative to the TOC-pointer.
ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL8, dl, MVT::i64,
SDValue(Tmp, 0), GA));
@@ -7777,6 +7774,10 @@ void PPCDAGToDAGISel::PeepholePPC64() {
Flags = PPCII::MO_TLSLD_LO;
break;
case PPC::ADDItocL8:
+      // Skip the following peephole optimizations for ADDItocL8 on AIX, which
+      // is used for toc-data access.
+ if (Subtarget->isAIXABI())
+ continue;
Flags = PPCII::MO_TOC_LO;
break;
}
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 9e56de732c58..85bbfabf5d3c 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -4438,6 +4438,12 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
if (Opc != PPC::ADDItocL8 && Opc != PPC::ADDI && Opc != PPC::ADDI8)
return false;
+  // Skip the transformTo[NewImm|Imm]FormFedByAdd optimization for ADDItocL8 on
+  // AIX, which is used for toc-data access. TODO: Follow up to see whether it
+  // can apply to AIX toc-data as well.
+ if (Opc == PPC::ADDItocL8 && Subtarget.isAIX())
+ return false;
+
assert(DefMI.getNumOperands() >= 3 &&
"Add inst must have at least three operands");
RegMO = &DefMI.getOperand(1);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 7929a781dbda..e3d6d2f094f2 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -3346,7 +3346,7 @@ def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentr
"#ADDIStocHA",
[(set i32:$rD,
(PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>;
-// TOC Data Transform AIX
+// TOC Data Transform on AIX
def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg),
"#ADDItoc",
[(set i32:$rD,
diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
index 4c9f5ff18bb6..d10fe11bb587 100644
--- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
@@ -29,8 +29,10 @@ using namespace llvm;
namespace {
static StringRef MASSVFuncs[] = {
-#define TLI_DEFINE_MASSV_VECFUNCS_NAMES
+#define TLI_DEFINE_MASSV_VECFUNCS
+#define TLI_DEFINE_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) VEC,
#include "llvm/Analysis/VecFuncs.def"
+#undef TLI_DEFINE_MASSV_VECFUNCS
};
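A self-contained demo of the X-macro re-include trick used above, with hypothetical names (the real rows come from llvm/Analysis/VecFuncs.def): each TLI_DEFINE_VECFUNC row contributes only its vector-function name to MASSVFuncs.

#include "llvm/ADT/StringRef.h"
#define DEMO_VECFUNCS \
  DEMO_VECFUNC("sinf", "__demo_sinf4", 4, "_ZGV_demo") \
  DEMO_VECFUNC("cosf", "__demo_cosf4", 4, "_ZGV_demo")
static llvm::StringRef DemoFuncs[] = {
#define DEMO_VECFUNC(SCAL, VEC, VF, VABI_PREFIX) VEC,
    DEMO_VECFUNCS
#undef DEMO_VECFUNC
};
// DemoFuncs == {"__demo_sinf4", "__demo_cosf4"}: only the vector names survive,
// which is exactly how MASSVFuncs is built from the TLI_DEFINE_VECFUNC rows.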
class PPCLowerMASSVEntries : public ModulePass {
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index c73fe2c6cecb..dbfcab7233bf 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -130,6 +130,10 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
getActionDefinitionsBuilder({G_SADDO, G_SSUBO}).minScalar(0, sXLen).lower();
+ // TODO: Use Vector Single-Width Saturating Instructions for vector types.
+ getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT})
+ .lower();
+
auto &ShiftActions = getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL});
if (ST.is64Bit())
ShiftActions.customFor({{s32, s32}});
@@ -137,7 +141,8 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.widenScalarToNextPow2(0)
.clampScalar(1, s32, sXLen)
.clampScalar(0, s32, sXLen)
- .minScalarSameAs(1, 0);
+ .minScalarSameAs(1, 0)
+ .widenScalarToNextPow2(1);
auto &ExtActions =
getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
@@ -344,6 +349,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
.widenScalarToNextPow2(0);
}
+ // TODO: Use libcall for sDoubleXLen.
+ getActionDefinitionsBuilder({G_UDIVREM, G_SDIVREM}).lower();
+
auto &AbsActions = getActionDefinitionsBuilder(G_ABS);
if (ST.hasStdExtZbb())
AbsActions.customFor({s32, sXLen}).minScalar(0, sXLen);
@@ -367,6 +375,11 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM})
.legalIf(typeIsScalarFPArith(0, ST));
+ getActionDefinitionsBuilder(G_FREM)
+ .libcallFor({s32, s64})
+ .minScalar(0, s32)
+ .scalarize(0);
+
getActionDefinitionsBuilder(G_FCOPYSIGN)
.legalIf(all(typeIsScalarFPArith(0, ST), typeIsScalarFPArith(1, ST)));
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index b099496d1838..a78d78946be3 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -152,7 +152,8 @@ def HasStdExtZimop : Predicate<"Subtarget->hasStdExtZimop()">,
def FeatureStdExtZicfilp
: RISCVExperimentalExtension<"zicfilp", 0, 4,
- "'Zicfilp' (Landing pad)">;
+ "'Zicfilp' (Landing pad)",
+ [FeatureStdExtZicsr]>;
def HasStdExtZicfilp : Predicate<"Subtarget->hasStdExtZicfilp()">,
AssemblerPredicate<(all_of FeatureStdExtZicfilp),
"'Zicfilp' (Landing pad)">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 06f85698d296..f0e5a7d393b6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -688,7 +688,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL,
ISD::VP_SDIV, ISD::VP_UDIV, ISD::VP_SREM,
ISD::VP_UREM, ISD::VP_AND, ISD::VP_OR,
- ISD::VP_XOR, ISD::VP_ASHR, ISD::VP_LSHR,
+ ISD::VP_XOR, ISD::VP_SRA, ISD::VP_SRL,
ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX,
ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN,
@@ -1919,7 +1919,7 @@ bool RISCVTargetLowering::hasAndNotCompare(SDValue Y) const {
return false;
return (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb()) &&
- !isa<ConstantSDNode>(Y);
+ (!isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque());
}
bool RISCVTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
@@ -5341,7 +5341,7 @@ RISCVTargetLowering::lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op,
SDValue Exp;
// Restore back to original type. Truncation after SRL is to generate vnsrl.
if (Op->isVPOpcode()) {
- Exp = DAG.getNode(ISD::VP_LSHR, DL, IntVT, Bitcast,
+ Exp = DAG.getNode(ISD::VP_SRL, DL, IntVT, Bitcast,
DAG.getConstant(ShiftAmt, DL, IntVT), Mask, VL);
Exp = DAG.getVPZExtOrTrunc(DL, VT, Exp, Mask, VL);
} else {
@@ -5923,9 +5923,9 @@ static unsigned getRISCVVLOp(SDValue Op) {
case ISD::VP_SELECT:
case ISD::VP_MERGE:
return RISCVISD::VMERGE_VL;
- case ISD::VP_ASHR:
+ case ISD::VP_SRA:
return RISCVISD::SRA_VL;
- case ISD::VP_LSHR:
+ case ISD::VP_SRL:
return RISCVISD::SRL_VL;
case ISD::VP_SQRT:
return RISCVISD::FSQRT_VL;
@@ -7010,8 +7010,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
!Subtarget.hasVInstructionsF16()))
return SplitVPOp(Op, DAG);
[[fallthrough]];
- case ISD::VP_ASHR:
- case ISD::VP_LSHR:
+ case ISD::VP_SRA:
+ case ISD::VP_SRL:
case ISD::VP_SHL:
return lowerVPOp(Op, DAG);
case ISD::VP_IS_FPCLASS:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 9d574edb4e6d..ce50fe6e2cbb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1560,8 +1560,8 @@ def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), [],
// -riscv-use-rematerializable-movimm in RISCVISelDAGToDAG.cpp
// It will be expanded after register allocation.
// FIXME: The scheduling information does not reflect the multiple instructions.
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, isCodeGenOnly = 1,
- isPseudo = 1, isReMaterializable = 1, IsSignExtendingOpW = 1 in
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8,
+ isReMaterializable = 1 in
def PseudoMovImm : Pseudo<(outs GPR:$dst), (ins i32imm:$imm), []>,
Sched<[WriteIALU]>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 0bbf71519953..b5817237b7fd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -1680,8 +1680,9 @@ let Predicates = [HasVInstructions] in {
let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in {
defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100>;
def VRGATHEREI16_VV : VALUVV<0b001110, OPIVV, "vrgatherei16.vv">,
- SchedBinaryMC<"WriteVRGatherVV", "ReadVRGatherVV_data",
- "ReadVRGatherVV_index">;
+ SchedBinaryMC<"WriteVRGatherEI16VV",
+ "ReadVRGatherEI16VV_data",
+ "ReadVRGatherEI16VV_index">;
} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather
// Vector Compress Instruction
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 8bf0f25d496a..f2c867a08ec2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -2249,13 +2249,13 @@ multiclass VPseudoBinaryFV_VV_RM<LMULInfo m, string Constraint = "", int sew = 0
UsesVXRM=0>;
}
-multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> {
+multiclass VPseudoVGTR_EI16_VV<string Constraint = ""> {
foreach m = MxList in {
defvar mx = m.MX;
foreach sew = EEWList in {
defvar dataEMULOctuple = m.octuple;
- // emul = lmul * eew / sew
- defvar idxEMULOctuple = !srl(!mul(dataEMULOctuple, eew), !logtwo(sew));
+ // emul = lmul * 16 / sew
+ defvar idxEMULOctuple = !srl(!mul(dataEMULOctuple, 16), !logtwo(sew));
if !and(!ge(idxEMULOctuple, 1), !le(idxEMULOctuple, 64)) then {
defvar emulMX = octuple_to_str<idxEMULOctuple>.ret;
defvar emul = !cast<LMULInfo>("V_" # emulMX);
@@ -2264,8 +2264,8 @@ multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> {
defm _VV
: VPseudoBinaryEmul<m.vrclass, m.vrclass, emul.vrclass, m, emul,
Constraint, e>,
- SchedBinary<"WriteVRGatherVV", "ReadVRGatherVV_data",
- "ReadVRGatherVV_index", mx, e, forceMergeOpRead=true>;
+ SchedBinary<"WriteVRGatherEI16VV", "ReadVRGatherEI16VV_data",
+ "ReadVRGatherEI16VV_index", mx, e, forceMergeOpRead=true>;
}
}
}
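A worked version of the octuple arithmetic above in plain C++ (illustrative only; TableGen's octuple values encode eight times the effective LMUL, and vrgatherei16 fixes the index EEW at 16):

#include <bit>
constexpr unsigned idxEMULOctuple(unsigned DataEMULOctuple, unsigned SEW) {
  // Mirrors !srl(!mul(dataEMULOctuple, 16), !logtwo(sew)): emul = lmul * 16 / sew.
  return (DataEMULOctuple * 16) >> std::countr_zero(SEW);
}
static_assert(idxEMULOctuple(8, 32) == 4, "LMUL=1, SEW=32 -> index EMUL = 1/2 (MF2)");
static_assert(idxEMULOctuple(8, 8) == 16, "LMUL=1, SEW=8  -> index EMUL = 2 (M2)");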
@@ -6879,8 +6879,7 @@ let Predicates = [HasVInstructionsAnyF] in {
//===----------------------------------------------------------------------===//
let Predicates = [HasVInstructions] in {
defm PseudoVRGATHER : VPseudoVGTR_VV_VX_VI<uimm5, "@earlyclobber $rd">;
-defm PseudoVRGATHEREI16 : VPseudoVGTR_VV_EEW<eew=16,
- Constraint="@earlyclobber $rd">;
+defm PseudoVRGATHEREI16 : VPseudoVGTR_EI16_VV<Constraint = "@earlyclobber $rd">;
//===----------------------------------------------------------------------===//
// 16.5. Vector Compress Instruction
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index a4a5d9e96c27..6ebf9f1eb045 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -85,7 +85,7 @@ def ROCKET : RISCVTuneProcessorModel<"rocket",
def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series",
SiFive7Model,
- [TuneSiFive7]>;
+ [TuneSiFive7, FeaturePostRAScheduler]>;
def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20",
RocketModel,
@@ -145,7 +145,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76",
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtC],
- [TuneSiFive7]>;
+ [TuneSiFive7, FeaturePostRAScheduler]>;
def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21",
RocketModel,
@@ -189,7 +189,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
FeatureStdExtD,
FeatureStdExtC,
FeatureStdExtZihintpause],
- [TuneSiFive7]>;
+ [TuneSiFive7, FeaturePostRAScheduler]>;
def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
RocketModel,
@@ -212,7 +212,7 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7]>;
+ [TuneSiFive7, FeaturePostRAScheduler]>;
def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
[Feature64Bit,
@@ -230,6 +230,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
FeatureStdExtZba,
FeatureStdExtZbb],
[TuneSiFive7,
+ FeaturePostRAScheduler,
TuneDLenFactor2]>;
def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
@@ -262,7 +263,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
[TuneNoDefaultUnroll,
TuneConditionalCompressedMoveFusion,
TuneLUIADDIFusion,
- TuneAUIPCADDIFusion]>;
+ TuneAUIPCADDIFusion,
+ FeaturePostRAScheduler]>;
def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
[Feature64Bit,
@@ -302,7 +304,8 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
TuneConditionalCompressedMoveFusion,
TuneLUIADDIFusion,
TuneAUIPCADDIFusion,
- TuneNoSinkSplatOperands]>;
+ TuneNoSinkSplatOperands,
+ FeaturePostRAScheduler]>;
def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",
SyntacoreSCR1Model,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 83fb75727bbe..b2991145ee65 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -199,7 +199,6 @@ def SiFive7Model : SchedMachineModel {
let LoadLatency = 3;
let MispredictPenalty = 3;
let CompleteModel = 0;
- let PostRAScheduler = true;
let EnableIntervals = true;
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
@@ -928,6 +927,7 @@ foreach mx = SchedMxList in {
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
}
}
@@ -1273,6 +1273,8 @@ defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
index a37958826e02..80362cae00fc 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -13,7 +13,6 @@ def SiFiveP400Model : SchedMachineModel {
let MicroOpBufferSize = 56; // Max micro-ops that can be buffered.
let LoadLatency = 4; // Cycles for loads to access the cache.
let MispredictPenalty = 9; // Extra cycles for a mispredicted branch.
- let PostRAScheduler = true;
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
HasStdExtZknh, HasStdExtZksed, HasStdExtZksh,
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
index 07d72b61862d..f0697a1b0673 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -56,7 +56,6 @@ def SiFiveP600Model : SchedMachineModel {
let MicroOpBufferSize = 160; // Max micro-ops that can be buffered.
let LoadLatency = 4; // Cycles for loads to access the cache.
let MispredictPenalty = 9; // Extra cycles for a mispredicted branch.
- let PostRAScheduler = true;
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
@@ -716,6 +715,7 @@ foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
let Latency = 3, ReleaseAtCycles = [1] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
}
}
@@ -736,6 +736,7 @@ foreach mx = ["M2", "M4", "M8"] in {
defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
let Latency = 6, ReleaseAtCycles = [LMulLat] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SiFiveP600VEXQ1], mx, sew, IsWorstCase>;
}
}
@@ -1071,6 +1072,8 @@ defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td
index e4524185991e..449611c58303 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleV.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td
@@ -521,6 +521,7 @@ defm "" : LMULSchedWrites<"WriteVISlide1X">;
defm "" : LMULSchedWrites<"WriteVFSlide1F">;
// 16.4. Vector Register Gather Instructions
defm "" : LMULSEWSchedWrites<"WriteVRGatherVV">;
+defm "" : LMULSEWSchedWrites<"WriteVRGatherEI16VV">;
defm "" : LMULSchedWrites<"WriteVRGatherVX">;
defm "" : LMULSchedWrites<"WriteVRGatherVI">;
// 16.5. Vector Compress Instruction
@@ -749,6 +750,8 @@ defm "" : LMULSchedReads<"ReadVFSlideF">;
// 16.4. Vector Register Gather Instructions
defm "" : LMULSEWSchedReads<"ReadVRGatherVV_data">;
defm "" : LMULSEWSchedReads<"ReadVRGatherVV_index">;
+defm "" : LMULSEWSchedReads<"ReadVRGatherEI16VV_data">;
+defm "" : LMULSEWSchedReads<"ReadVRGatherEI16VV_index">;
defm "" : LMULSchedReads<"ReadVRGatherVX_data">;
defm "" : LMULSchedReads<"ReadVRGatherVX_index">;
defm "" : LMULSchedReads<"ReadVRGatherVI_data">;
@@ -956,6 +959,7 @@ defm "" : LMULWriteRes<"WriteVSlideI", []>;
defm "" : LMULWriteRes<"WriteVISlide1X", []>;
defm "" : LMULWriteRes<"WriteVFSlide1F", []>;
defm "" : LMULSEWWriteRes<"WriteVRGatherVV", []>;
+defm "" : LMULSEWWriteRes<"WriteVRGatherEI16VV", []>;
defm "" : LMULWriteRes<"WriteVRGatherVX", []>;
defm "" : LMULWriteRes<"WriteVRGatherVI", []>;
defm "" : LMULSEWWriteRes<"WriteVCompressV", []>;
@@ -1120,6 +1124,8 @@ defm "" : LMULReadAdvance<"ReadVFSlideV", 0>;
defm "" : LMULReadAdvance<"ReadVFSlideF", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>;
defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>;
+defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>;
defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index c880c9e921e0..347c1bc3c278 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -121,9 +121,7 @@ public:
}
bool enableMachineScheduler() const override { return true; }
- bool enablePostRAScheduler() const override {
- return getSchedModel().PostRAScheduler || UsePostRAScheduler;
- }
+ bool enablePostRAScheduler() const override { return UsePostRAScheduler; }
Align getPrefFunctionAlignment() const {
return Align(TuneInfo->PrefFunctionAlignment);
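
The RISC-V hunks above move post-RA scheduling from a per-scheduling-model flag to a processor tune feature: FeaturePostRAScheduler sets the UsePostRAScheduler subtarget bit, and enablePostRAScheduler() now consults only that bit. A minimal standalone sketch of the resulting hook shape (SketchSubtarget and its fields are illustrative stand-ins, not the generated LLVM classes):

#include <iostream>

// Illustrative stand-in for the generated subtarget: the post-RA scheduling
// decision comes from a tune feature bit on the CPU definition
// (FeaturePostRAScheduler -> UsePostRAScheduler) rather than from the
// scheduling model's PostRAScheduler flag.
struct SketchSubtarget {
  bool UsePostRAScheduler = false; // set when the processor lists the feature

  bool enablePostRAScheduler() const { return UsePostRAScheduler; }
};

int main() {
  SketchSubtarget Generic;                               // no tune feature
  SketchSubtarget SiFive7{/*UsePostRAScheduler=*/true};  // feature present
  std::cout << Generic.enablePostRAScheduler() << ' '
            << SiFive7.enablePostRAScheduler() << '\n';  // prints "0 1"
}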
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ca8279672c09..176d0e79253a 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1881,10 +1881,14 @@ unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
const TargetTransformInfo::LSRCost &C2) {
// RISC-V specific here are "instruction number 1st priority".
- return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+ // If we need to emit adds inside the loop to add up base registers, then
+ // we need at least one extra temporary register.
+ unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
+ unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
+ return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
C1.NumIVMuls, C1.NumBaseAdds,
C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
- std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+ std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
C2.NumIVMuls, C2.NumBaseAdds,
C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
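
The isLSRCostLess change keeps the lexicographic comparison but charges a candidate one extra register when it needs in-loop adds of base registers. A standalone sketch of the adjusted comparison, with a local Cost struct standing in for TargetTransformInfo::LSRCost:

#include <iostream>
#include <tuple>

// Local stand-in for TargetTransformInfo::LSRCost with only the fields used.
struct Cost {
  unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
      ImmCost, SetupCost;
};

// Instruction count stays the first key; a candidate that must materialize
// base-register adds inside the loop is treated as using one more register.
static bool isLSRCostLess(const Cost &C1, const Cost &C2) {
  unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
  unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
  return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

int main() {
  // Same instruction and register counts, but C1 needs a base add inside the
  // loop: the implied extra temporary makes C1 compare as more expensive.
  Cost C1{4, 3, 0, 0, /*NumBaseAdds=*/1, 0, 0, 0};
  Cost C2{4, 3, 0, 0, /*NumBaseAdds=*/0, 0, 0, 0};
  std::cout << isLSRCostLess(C1, C2) << '\n'; // prints 0: C1 is not cheaper
}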
diff --git a/llvm/lib/Target/SPIRV/CMakeLists.txt b/llvm/lib/Target/SPIRV/CMakeLists.txt
index 7001ac382f41..fe09d5903045 100644
--- a/llvm/lib/Target/SPIRV/CMakeLists.txt
+++ b/llvm/lib/Target/SPIRV/CMakeLists.txt
@@ -17,6 +17,7 @@ add_llvm_target(SPIRVCodeGen
SPIRVAsmPrinter.cpp
SPIRVBuiltins.cpp
SPIRVCallLowering.cpp
+ SPIRVInlineAsmLowering.cpp
SPIRVCommandLine.cpp
SPIRVDuplicatesTracker.cpp
SPIRVEmitIntrinsics.cpp
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index b468b71cc0ef..5c286acdcc9b 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -321,14 +321,19 @@ void SPIRVInstPrinter::printStringImm(const MCInst *MI, unsigned OpNo,
if (MI->getOperand(StrStartIndex).isReg())
break;
- std::string Str = getSPIRVStringOperand(*MI, OpNo);
+ std::string Str = getSPIRVStringOperand(*MI, StrStartIndex);
if (StrStartIndex != OpNo)
O << ' '; // Add a space if we're starting a new string/argument.
O << '"';
for (char c : Str) {
- if (c == '"')
- O.write('\\'); // Escape " characters (might break for complex UTF-8).
- O.write(c);
+ // Escape ", \n characters (might break for complex UTF-8).
+ if (c == '\n') {
+ O.write("\\n", 2);
+ } else {
+ if (c == '"')
+ O.write('\\');
+ O.write(c);
+ }
}
O << '"';
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 9fde26c900f5..424087f361a6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -1118,6 +1118,39 @@ static bool generateGroupUniformInst(const SPIRV::IncomingCall *Call,
return true;
}
+static bool generateKernelClockInst(const SPIRV::IncomingCall *Call,
+ MachineIRBuilder &MIRBuilder,
+ SPIRVGlobalRegistry *GR) {
+ const SPIRV::DemangledBuiltin *Builtin = Call->Builtin;
+ MachineFunction &MF = MIRBuilder.getMF();
+ const auto *ST = static_cast<const SPIRVSubtarget *>(&MF.getSubtarget());
+ if (!ST->canUseExtension(SPIRV::Extension::SPV_KHR_shader_clock)) {
+ std::string DiagMsg = std::string(Builtin->Name) +
+ ": the builtin requires the following SPIR-V "
+ "extension: SPV_KHR_shader_clock";
+ report_fatal_error(DiagMsg.c_str(), false);
+ }
+
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+ Register ResultReg = Call->ReturnRegister;
+ MRI->setRegClass(ResultReg, &SPIRV::IDRegClass);
+
+ // Deduce the `Scope` operand from the builtin function name.
+ SPIRV::Scope::Scope ScopeArg =
+ StringSwitch<SPIRV::Scope::Scope>(Builtin->Name)
+ .EndsWith("device", SPIRV::Scope::Scope::Device)
+ .EndsWith("work_group", SPIRV::Scope::Scope::Workgroup)
+ .EndsWith("sub_group", SPIRV::Scope::Scope::Subgroup);
+ Register ScopeReg = buildConstantIntReg(ScopeArg, MIRBuilder, GR);
+
+ MIRBuilder.buildInstr(SPIRV::OpReadClockKHR)
+ .addDef(ResultReg)
+ .addUse(GR->getSPIRVTypeID(Call->ReturnType))
+ .addUse(ScopeReg);
+
+ return true;
+}
+
// These queries ask for a single size_t result for a given dimension index, e.g.
// size_t get_global_id(uint dimindex). In SPIR-V, the builtins corresponding to
// these values are all vec3 types, so we need to extract the correct index or
@@ -2290,6 +2323,8 @@ std::optional<bool> lowerBuiltin(const StringRef DemangledCall,
return generateIntelSubgroupsInst(Call.get(), MIRBuilder, GR);
case SPIRV::GroupUniform:
return generateGroupUniformInst(Call.get(), MIRBuilder, GR);
+ case SPIRV::KernelClock:
+ return generateKernelClockInst(Call.get(), MIRBuilder, GR);
}
return false;
}
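
generateKernelClockInst picks the OpReadClockKHR scope from the trailing part of the OpenCL builtin name. A standalone sketch of that suffix dispatch, using plain C++20 string matching in place of llvm::StringSwitch (Scope here is an illustrative enum, not the SPIR-V definition):

#include <iostream>
#include <string_view>

// Illustrative stand-in for the SPIR-V scope values used by OpReadClockKHR.
enum class Scope { Device, Workgroup, Subgroup };

// Mirrors the StringSwitch in generateKernelClockInst: the scope operand is
// deduced from the suffix of clock_read_* / clock_read_hilo_* builtins.
static Scope scopeFromBuiltinName(std::string_view Name) {
  if (Name.ends_with("device"))
    return Scope::Device;
  if (Name.ends_with("work_group"))
    return Scope::Workgroup;
  return Scope::Subgroup; // "...sub_group"
}

int main() {
  std::cout << int(scopeFromBuiltinName("clock_read_hilo_work_group")) << '\n';
  // prints 1 (Workgroup)
}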
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
index 564028547821..692234c405ab 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td
@@ -58,6 +58,7 @@ def LoadStore : BuiltinGroup;
def IntelSubgroups : BuiltinGroup;
def AtomicFloating : BuiltinGroup;
def GroupUniform : BuiltinGroup;
+def KernelClock : BuiltinGroup;
//===----------------------------------------------------------------------===//
// Class defining a demangled builtin record. The information in the record
@@ -952,6 +953,14 @@ defm : DemangledGroupBuiltin<"group_scan_exclusive_logical_xor", OnlyWork, OpGro
defm : DemangledGroupBuiltin<"group_scan_inclusive_logical_xor", OnlyWork, OpGroupLogicalXorKHR>;
defm : DemangledGroupBuiltin<"group_reduce_logical_xor", OnlyWork, OpGroupLogicalXorKHR>;
+// cl_khr_kernel_clock / SPV_KHR_shader_clock
+defm : DemangledNativeBuiltin<"clock_read_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_hilo_device", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_hilo_work_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+defm : DemangledNativeBuiltin<"clock_read_hilo_sub_group", OpenCL_std, KernelClock, 0, 0, OpReadClockKHR>;
+
//===----------------------------------------------------------------------===//
// Class defining an atomic instruction on floating-point numbers.
//
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index 691e6ee0e582..7f531542544a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -47,6 +47,8 @@ static const std::map<std::string, SPIRV::Extension::Extension>
SPIRV::Extension::Extension::SPV_KHR_bit_instructions},
{"SPV_KHR_linkonce_odr",
SPIRV::Extension::Extension::SPV_KHR_linkonce_odr},
+ {"SPV_INTEL_inline_assembly",
+ SPIRV::Extension::Extension::SPV_INTEL_inline_assembly},
{"SPV_INTEL_bfloat16_conversion",
SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion},
{"SPV_KHR_subgroup_rotate",
@@ -55,6 +57,8 @@ static const std::map<std::string, SPIRV::Extension::Extension>
SPIRV::Extension::Extension::SPV_INTEL_variable_length_array},
{"SPV_INTEL_function_pointers",
SPIRV::Extension::Extension::SPV_INTEL_function_pointers},
+ {"SPV_KHR_shader_clock",
+ SPIRV::Extension::Extension::SPV_KHR_shader_clock},
};
bool SPIRVExtensionsParser::parse(cl::Option &O, llvm::StringRef ArgName,
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index a1a08c5c699b..ea53fe55e7ab 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -140,6 +140,7 @@ public:
Instruction *visitAllocaInst(AllocaInst &I);
Instruction *visitAtomicCmpXchgInst(AtomicCmpXchgInst &I);
Instruction *visitUnreachableInst(UnreachableInst &I);
+ Instruction *visitCallInst(CallInst &I);
StringRef getPassName() const override { return "SPIRV emit intrinsics"; }
@@ -629,6 +630,28 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) {
}
}
+Instruction *SPIRVEmitIntrinsics::visitCallInst(CallInst &Call) {
+ if (!Call.isInlineAsm())
+ return &Call;
+
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
+ LLVMContext &Ctx = F->getContext();
+
+ Constant *TyC = UndefValue::get(IA->getFunctionType());
+ MDString *ConstraintString = MDString::get(Ctx, IA->getConstraintString());
+ SmallVector<Value *> Args = {
+ MetadataAsValue::get(Ctx,
+ MDNode::get(Ctx, ValueAsMetadata::getConstant(TyC))),
+ MetadataAsValue::get(Ctx, MDNode::get(Ctx, ConstraintString))};
+ for (unsigned OpIdx = 0; OpIdx < Call.arg_size(); OpIdx++)
+ Args.push_back(Call.getArgOperand(OpIdx));
+
+ IRBuilder<> B(Call.getParent());
+ B.SetInsertPoint(&Call);
+ B.CreateIntrinsic(Intrinsic::spv_inline_asm, {}, {Args});
+ return &Call;
+}
+
Instruction *SPIRVEmitIntrinsics::visitSwitchInst(SwitchInst &I) {
BasicBlock *ParentBB = I.getParent();
IRBuilder<> B(ParentBB);
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
index 96b4a570a26b..2bd22bbd6316 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.cpp
@@ -82,6 +82,28 @@ bool SPIRVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
+std::pair<unsigned, const TargetRegisterClass *>
+SPIRVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint,
+ MVT VT) const {
+ const TargetRegisterClass *RC = nullptr;
+ if (Constraint.starts_with("{"))
+ return std::make_pair(0u, RC);
+
+ if (VT.isFloatingPoint())
+ RC = VT.isVector() ? &SPIRV::vfIDRegClass
+ : (VT.getScalarSizeInBits() > 32 ? &SPIRV::fID64RegClass
+ : &SPIRV::fIDRegClass);
+ else if (VT.isInteger())
+ RC = VT.isVector() ? &SPIRV::vIDRegClass
+ : (VT.getScalarSizeInBits() > 32 ? &SPIRV::ID64RegClass
+ : &SPIRV::IDRegClass);
+ else
+ RC = &SPIRV::IDRegClass;
+
+ return std::make_pair(0u, RC);
+}
+
// Insert a bitcast before the instruction to keep SPIR-V code valid
// when there is a type mismatch between results and operand types.
static void validatePtrTypes(const SPIRVSubtarget &STI,
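
getRegForInlineAsmConstraint chooses between the 32-bit and new 64-bit integer/float ID classes (and their vector variants) purely from the constraint's value type. A standalone sketch of that mapping, returning class names as strings for illustration (SketchVT is a stand-in for llvm::MVT; the fallback to IDRegClass for non-integer, non-float types is folded into the integer branch here):

#include <iostream>
#include <string>

// Illustrative stand-in describing just the value-type properties the hook
// inspects; the real code works on llvm::MVT and SPIR-V register classes.
struct SketchVT {
  bool IsFloat, IsVector;
  unsigned ScalarBits;
};

// Floats pick the fID/vfID family, everything else the ID/vID family, with a
// 64-bit variant for scalar types wider than 32 bits.
static std::string classFor(const SketchVT &VT) {
  if (VT.IsFloat)
    return VT.IsVector ? "vfID" : (VT.ScalarBits > 32 ? "fID64" : "fID");
  return VT.IsVector ? "vID" : (VT.ScalarBits > 32 ? "ID64" : "ID");
}

int main() {
  std::cout << classFor({true, false, 64}) << ' '    // fID64
            << classFor({false, false, 32}) << '\n'; // ID
}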
diff --git a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
index 8c1de7d97d1a..6fc200abf462 100644
--- a/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVISelLowering.h
@@ -55,6 +55,15 @@ public:
MachineFunction &MF,
unsigned Intrinsic) const override;
+ std::pair<unsigned, const TargetRegisterClass *>
+ getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
+ StringRef Constraint, MVT VT) const override;
+ unsigned
+ getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT = std::nullopt) const override {
+ return 1;
+ }
+
// Call the default implementation and finalize target lowering by inserting
// extra instructions required to preserve validity of SPIR-V code imposed by
// the standard.
diff --git a/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.cpp
new file mode 100644
index 000000000000..8bd4fb6bf8b1
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.cpp
@@ -0,0 +1,46 @@
+//===--- SPIRVInlineAsmLowering.cpp - Inline Asm lowering -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering of LLVM inline asm calls to machine code
+// calls for GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRVInlineAsmLowering.h"
+#include "SPIRVSubtarget.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsSPIRV.h"
+
+using namespace llvm;
+
+SPIRVInlineAsmLowering::SPIRVInlineAsmLowering(const SPIRVTargetLowering &TLI)
+ : InlineAsmLowering(&TLI) {}
+
+bool SPIRVInlineAsmLowering::lowerAsmOperandForConstraint(
+ Value *Val, StringRef Constraint, std::vector<MachineOperand> &Ops,
+ MachineIRBuilder &MIRBuilder) const {
+ Value *ValOp = nullptr;
+ if (isa<ConstantInt>(Val)) {
+ ValOp = Val;
+ } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(Val)) {
+ Ops.push_back(MachineOperand::CreateFPImm(CFP));
+ return true;
+ } else if (auto *II = dyn_cast<IntrinsicInst>(Val)) {
+ if (II->getIntrinsicID() == Intrinsic::spv_track_constant) {
+ if (isa<ConstantInt>(II->getOperand(0))) {
+ ValOp = II->getOperand(0);
+ } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(II->getOperand(0))) {
+ Ops.push_back(MachineOperand::CreateFPImm(CFP));
+ return true;
+ }
+ }
+ }
+ return ValOp ? InlineAsmLowering::lowerAsmOperandForConstraint(
+ ValOp, Constraint, Ops, MIRBuilder)
+ : false;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.h b/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.h
new file mode 100644
index 000000000000..72291855a18c
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVInlineAsmLowering.h
@@ -0,0 +1,33 @@
+//===--- SPIRVInlineAsmLowering.h - Inline Asm lowering ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes how to lower LLVM inline asm calls to machine
+// code calls for GlobalISel.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVINLINEASMLOWERING_H
+#define LLVM_LIB_TARGET_SPIRV_SPIRVINLINEASMLOWERING_H
+
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
+
+namespace llvm {
+
+class SPIRVTargetLowering;
+
+class SPIRVInlineAsmLowering : public InlineAsmLowering {
+public:
+ SPIRVInlineAsmLowering(const SPIRVTargetLowering &TLI);
+ bool
+ lowerAsmOperandForConstraint(Value *Val, StringRef Constraint,
+ std::vector<MachineOperand> &Ops,
+ MachineIRBuilder &MIRBuilder) const override;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVINLINEASMLOWERING_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index af98f2f88045..12cf7613a45c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -47,6 +47,16 @@ bool SPIRVInstrInfo::isConstantInstr(const MachineInstr &MI) const {
}
}
+bool SPIRVInstrInfo::isInlineAsmDefInstr(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case SPIRV::OpAsmTargetINTEL:
+ case SPIRV::OpAsmINTEL:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool SPIRVInstrInfo::isTypeDeclInstr(const MachineInstr &MI) const {
auto &MRI = MI.getMF()->getRegInfo();
if (MI.getNumDefs() >= 1 && MI.getOperand(0).isReg()) {
@@ -246,7 +256,8 @@ void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
bool SPIRVInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
- if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_fID ||
+ if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_ID64 ||
+ MI.getOpcode() == SPIRV::GET_fID || MI.getOpcode() == SPIRV::GET_fID64 ||
MI.getOpcode() == SPIRV::GET_pID32 ||
MI.getOpcode() == SPIRV::GET_pID64 || MI.getOpcode() == SPIRV::GET_vfID ||
MI.getOpcode() == SPIRV::GET_vID || MI.getOpcode() == SPIRV::GET_vpID32 ||
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index 4f2781c9404b..95f387491357 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -30,6 +30,7 @@ public:
const SPIRVRegisterInfo &getRegisterInfo() const { return RI; }
bool isHeaderInstr(const MachineInstr &MI) const;
bool isConstantInstr(const MachineInstr &MI) const;
+ bool isInlineAsmDefInstr(const MachineInstr &MI) const;
bool isTypeDeclInstr(const MachineInstr &MI) const;
bool isDecorationInstr(const MachineInstr &MI) const;
bool canUseFastMathFlags(const MachineInstr &MI) const;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index 151d0ec1fe56..7c9b84a48a2a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -18,7 +18,9 @@ let isCodeGenOnly=1 in {
def ASSIGN_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>;
def DECL_TYPE: Pseudo<(outs ANYID:$dst_id), (ins ANYID:$src_id, TYPE:$src_ty)>;
def GET_ID: Pseudo<(outs ID:$dst_id), (ins ANYID:$src)>;
+ def GET_ID64: Pseudo<(outs ID64:$dst_id), (ins ANYID:$src)>;
def GET_fID: Pseudo<(outs fID:$dst_id), (ins ANYID:$src)>;
+ def GET_fID64: Pseudo<(outs fID64:$dst_id), (ins ANYID:$src)>;
def GET_pID32: Pseudo<(outs pID32:$dst_id), (ins ANYID:$src)>;
def GET_pID64: Pseudo<(outs pID64:$dst_id), (ins ANYID:$src)>;
def GET_vID: Pseudo<(outs vID:$dst_id), (ins ANYID:$src)>;
@@ -802,6 +804,11 @@ def OpGroupNonUniformRotateKHR: Op<4431, (outs ID:$res),
(ins TYPE:$type, ID:$scope, ID:$value, ID:$delta, variable_ops),
"$res = OpGroupNonUniformRotateKHR $type $scope $value $delta">;
+// SPV_KHR_shader_clock
+def OpReadClockKHR: Op<5056, (outs ID:$res),
+ (ins TYPE:$type, ID:$scope),
+ "$res = OpReadClockKHR $type $scope">;
+
// 3.49.7, Constant-Creation Instructions
// - SPV_INTEL_function_pointers
@@ -849,3 +856,11 @@ def OpGroupLogicalOrKHR: Op<6407, (outs ID:$res), (ins TYPE:$type, ID:$scope, i3
"$res = OpGroupLogicalOrKHR $type $scope $groupOp $value">;
def OpGroupLogicalXorKHR: Op<6408, (outs ID:$res), (ins TYPE:$type, ID:$scope, i32imm:$groupOp, ID:$value),
"$res = OpGroupLogicalXorKHR $type $scope $groupOp $value">;
+
+// Inline Assembly Instructions
+def OpAsmTargetINTEL: Op<5609, (outs ID:$res), (ins StringImm:$str), "$res = OpAsmTargetINTEL $str">;
+def OpAsmINTEL: Op<5610, (outs ID:$res), (ins TYPE:$type, TYPE:$asm_type, ID:$target,
+ StringImm:$asm, StringImm:$constraints),
+ "$res = OpAsmINTEL $type $asm_type $target $asm">;
+def OpAsmCallINTEL: Op<5611, (outs ID:$res), (ins TYPE:$type, ID:$asm, variable_ops),
+ "$res = OpAsmCallINTEL $type $asm">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 235f947901d8..c86ab285f354 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1117,6 +1117,14 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::GroupUniformArithmeticKHR);
}
break;
+ case SPIRV::OpReadClockKHR:
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_KHR_shader_clock))
+ report_fatal_error("OpReadClockKHR instruction requires the "
+ "following SPIR-V extension: SPV_KHR_shader_clock",
+ false);
+ Reqs.addExtension(SPIRV::Extension::SPV_KHR_shader_clock);
+ Reqs.addCapability(SPIRV::Capability::ShaderClockKHR);
+ break;
case SPIRV::OpFunctionPointerCallINTEL:
if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_function_pointers)) {
Reqs.addExtension(SPIRV::Extension::SPV_INTEL_function_pointers);
@@ -1143,6 +1151,14 @@ void addInstrRequirements(const MachineInstr &MI,
Reqs.addCapability(SPIRV::Capability::VariableLengthArrayINTEL);
}
break;
+ case SPIRV::OpAsmTargetINTEL:
+ case SPIRV::OpAsmINTEL:
+ case SPIRV::OpAsmCallINTEL:
+ if (ST.canUseExtension(SPIRV::Extension::SPV_INTEL_inline_assembly)) {
+ Reqs.addExtension(SPIRV::Extension::SPV_INTEL_inline_assembly);
+ Reqs.addCapability(SPIRV::Capability::AsmINTEL);
+ }
+ break;
default:
break;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
index d652b5de6080..c3842f026670 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp
@@ -54,7 +54,8 @@ extern void processInstr(MachineInstr &MI, MachineIRBuilder &MIB,
} // namespace llvm
static bool isMetaInstrGET(unsigned Opcode) {
- return Opcode == SPIRV::GET_ID || Opcode == SPIRV::GET_fID ||
+ return Opcode == SPIRV::GET_ID || Opcode == SPIRV::GET_ID64 ||
+ Opcode == SPIRV::GET_fID || Opcode == SPIRV::GET_fID64 ||
Opcode == SPIRV::GET_pID32 || Opcode == SPIRV::GET_pID64 ||
Opcode == SPIRV::GET_vID || Opcode == SPIRV::GET_vfID ||
Opcode == SPIRV::GET_vpID32 || Opcode == SPIRV::GET_vpID64;
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 9bff23dd9666..85299a49a6b9 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -215,6 +215,8 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
SpirvTy = GR->getOrCreateSPIRVType(Ty, MIB);
break;
}
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
case TargetOpcode::G_ZEXT: {
if (MI->getOperand(1).isReg()) {
if (MachineInstr *DefInstr =
@@ -457,12 +459,7 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
Ty = VectorType::get(ElemTy, NumElts, false);
}
insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI);
- } else if (MI.getOpcode() == TargetOpcode::G_TRUNC ||
- MI.getOpcode() == TargetOpcode::G_ZEXT ||
- MI.getOpcode() == TargetOpcode::G_PTRTOINT ||
- MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE ||
- MI.getOpcode() == TargetOpcode::COPY ||
- MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) {
+ } else if (MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) {
propagateSPIRVType(&MI, GR, MRI, MIB);
}
@@ -474,6 +471,24 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR,
}
for (MachineInstr *MI : ToErase)
MI->eraseFromParent();
+
+ // Address the case when the IRTranslator introduces instructions with new
+ // registers that do not yet have an associated SPIRVType.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_PTRTOINT:
+ case TargetOpcode::COPY:
+ case TargetOpcode::G_ADDRSPACE_CAST:
+ propagateSPIRVType(&MI, GR, MRI, MIB);
+ break;
+ }
+ }
+ }
}
// Defined in SPIRVLegalizerInfo.cpp.
@@ -519,6 +534,128 @@ static void processInstrsWithTypeFolding(MachineFunction &MF,
}
}
+static void
+insertInlineAsmProcess(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+ const SPIRVSubtarget &ST, MachineIRBuilder MIRBuilder,
+ const SmallVector<MachineInstr *> &ToProcess) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register AsmTargetReg;
+ for (unsigned i = 0, Sz = ToProcess.size(); i + 1 < Sz; i += 2) {
+ MachineInstr *I1 = ToProcess[i], *I2 = ToProcess[i + 1];
+ assert(isSpvIntrinsic(*I1, Intrinsic::spv_inline_asm) && I2->isInlineAsm());
+ MIRBuilder.setInsertPt(*I1->getParent(), *I1);
+
+ if (!AsmTargetReg.isValid()) {
+ // define vendor specific assembly target or dialect
+ AsmTargetReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MRI.setRegClass(AsmTargetReg, &SPIRV::IDRegClass);
+ auto AsmTargetMIB =
+ MIRBuilder.buildInstr(SPIRV::OpAsmTargetINTEL).addDef(AsmTargetReg);
+ addStringImm(ST.getTargetTripleAsStr(), AsmTargetMIB);
+ GR->add(AsmTargetMIB.getInstr(), &MF, AsmTargetReg);
+ }
+
+ // create types
+ const MDNode *IAMD = I1->getOperand(1).getMetadata();
+ FunctionType *FTy = cast<FunctionType>(getMDOperandAsType(IAMD, 0));
+ SmallVector<SPIRVType *, 4> ArgTypes;
+ for (const auto &ArgTy : FTy->params())
+ ArgTypes.push_back(GR->getOrCreateSPIRVType(ArgTy, MIRBuilder));
+ SPIRVType *RetType =
+ GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder);
+ SPIRVType *FuncType = GR->getOrCreateOpTypeFunctionWithArgs(
+ FTy, RetType, ArgTypes, MIRBuilder);
+
+ // define vendor specific assembly instructions string
+ Register AsmReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MRI.setRegClass(AsmReg, &SPIRV::IDRegClass);
+ auto AsmMIB = MIRBuilder.buildInstr(SPIRV::OpAsmINTEL)
+ .addDef(AsmReg)
+ .addUse(GR->getSPIRVTypeID(RetType))
+ .addUse(GR->getSPIRVTypeID(FuncType))
+ .addUse(AsmTargetReg);
+ // inline asm string:
+ addStringImm(I2->getOperand(InlineAsm::MIOp_AsmString).getSymbolName(),
+ AsmMIB);
+ // inline asm constraint string:
+ addStringImm(cast<MDString>(I1->getOperand(2).getMetadata()->getOperand(0))
+ ->getString(),
+ AsmMIB);
+ GR->add(AsmMIB.getInstr(), &MF, AsmReg);
+
+ // calls the inline assembly instruction
+ unsigned ExtraInfo = I2->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
+ MIRBuilder.buildInstr(SPIRV::OpDecorate)
+ .addUse(AsmReg)
+ .addImm(static_cast<uint32_t>(SPIRV::Decoration::SideEffectsINTEL));
+ Register DefReg;
+ SmallVector<unsigned, 4> Ops;
+ unsigned StartOp = InlineAsm::MIOp_FirstOperand,
+ AsmDescOp = InlineAsm::MIOp_FirstOperand;
+ unsigned I2Sz = I2->getNumOperands();
+ for (unsigned Idx = StartOp; Idx != I2Sz; ++Idx) {
+ const MachineOperand &MO = I2->getOperand(Idx);
+ if (MO.isMetadata())
+ continue;
+ if (Idx == AsmDescOp && MO.isImm()) {
+ // compute the index of the next operand descriptor
+ const InlineAsm::Flag F(MO.getImm());
+ AsmDescOp += 1 + F.getNumOperandRegisters();
+ } else {
+ if (MO.isReg() && MO.isDef())
+ DefReg = MO.getReg();
+ else
+ Ops.push_back(Idx);
+ }
+ }
+ if (!DefReg.isValid()) {
+ DefReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+ MRI.setRegClass(DefReg, &SPIRV::IDRegClass);
+ SPIRVType *VoidType = GR->getOrCreateSPIRVType(
+ Type::getVoidTy(MF.getFunction().getContext()), MIRBuilder);
+ GR->assignSPIRVTypeToVReg(VoidType, DefReg, MF);
+ }
+ auto AsmCall = MIRBuilder.buildInstr(SPIRV::OpAsmCallINTEL)
+ .addDef(DefReg)
+ .addUse(GR->getSPIRVTypeID(RetType))
+ .addUse(AsmReg);
+ unsigned IntrIdx = 2;
+ for (unsigned Idx : Ops) {
+ ++IntrIdx;
+ const MachineOperand &MO = I2->getOperand(Idx);
+ if (MO.isReg())
+ AsmCall.addUse(MO.getReg());
+ else
+ AsmCall.addUse(I1->getOperand(IntrIdx).getReg());
+ }
+ }
+ for (MachineInstr *MI : ToProcess)
+ MI->eraseFromParent();
+}
+
+static void insertInlineAsm(MachineFunction &MF, SPIRVGlobalRegistry *GR,
+ const SPIRVSubtarget &ST,
+ MachineIRBuilder MIRBuilder) {
+ SmallVector<MachineInstr *> ToProcess;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (isSpvIntrinsic(MI, Intrinsic::spv_inline_asm) ||
+ MI.getOpcode() == TargetOpcode::INLINEASM)
+ ToProcess.push_back(&MI);
+ }
+ }
+ if (ToProcess.size() == 0)
+ return;
+
+ if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_inline_assembly))
+ report_fatal_error("Inline assembly instructions require the "
+ "following SPIR-V extension: SPV_INTEL_inline_assembly",
+ false);
+
+ insertInlineAsmProcess(MF, GR, ST, MIRBuilder, ToProcess);
+}
+
static void insertSpirvDecorations(MachineFunction &MF, MachineIRBuilder MIB) {
SmallVector<MachineInstr *, 10> ToErase;
for (MachineBasicBlock &MBB : MF) {
@@ -673,6 +810,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
processInstrsWithTypeFolding(MF, GR, MIB);
removeImplicitFallthroughs(MF, MIB);
insertSpirvDecorations(MF, MIB);
+ insertInlineAsm(MF, GR, ST, MIB);
return true;
}
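
insertInlineAsmProcess walks the INLINEASM machine instruction by treating immediate operands at the expected positions as InlineAsm::Flag descriptors and skipping past the registers each descriptor covers. A standalone sketch of that index arithmetic over a simplified operand list (Op is an invented stand-in for MachineOperand):

#include <iostream>
#include <vector>

// Simplified stand-in for an INLINEASM operand: either an immediate flag
// descriptor or a register. The real code uses MachineOperand and decodes the
// immediate with InlineAsm::Flag::getNumOperandRegisters().
struct Op {
  bool IsImmFlag;
  unsigned NumOperandRegisters; // only meaningful for flag descriptors
  unsigned Reg;                 // only meaningful for register operands
};

int main() {
  // Two operand groups: a flag covering one register, then a flag covering
  // two registers.
  std::vector<Op> Ops = {{true, 1, 0}, {false, 0, 100},
                         {true, 2, 0}, {false, 0, 101}, {false, 0, 102}};
  unsigned AsmDescOp = 0; // index where the next flag descriptor is expected
  for (unsigned Idx = 0; Idx != Ops.size(); ++Idx) {
    if (Idx == AsmDescOp && Ops[Idx].IsImmFlag) {
      // Jump over the registers this descriptor covers to find the next one.
      AsmDescOp += 1 + Ops[Idx].NumOperandRegisters;
    } else {
      std::cout << "register operand %" << Ops[Idx].Reg << '\n';
    }
  }
}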
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td
index dea2ef402d3d..e81d96956404 100644
--- a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td
@@ -10,4 +10,4 @@
// as InstructionSelector RegClass checking code relies on them
def TYPERegBank : RegisterBank<"TYPEBank", [TYPE]>;
-def IDRegBank : RegisterBank<"IDBank", [ID, fID, pID32, pID64, vID, vfID, vpID32, vpID64]>;
+def IDRegBank : RegisterBank<"IDBank", [ID, ID64, fID, fID64, pID32, pID64, vID, vfID, vpID32, vpID64]>;
diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td
index 9231d22e8d83..17f6ba59cc5d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td
@@ -29,7 +29,9 @@ let Namespace = "SPIRV" in {
// Class for non-type registers
def ID0 : Register<"ID0">;
+ def ID640 : Register<"ID640">;
def fID0 : Register<"fID0">;
+ def fID640 : Register<"fID640">;
def pID320 : Register<"pID320">;
def pID640 : Register<"pID640">;
def vID0 : Register<"vID0">;
@@ -38,7 +40,9 @@ let Namespace = "SPIRV" in {
def vpID640 : Register<"vpID640">;
def ID : RegisterClass<"SPIRV", [i32], 32, (add ID0)>;
+ def ID64 : RegisterClass<"SPIRV", [i64], 32, (add ID640)>;
def fID : RegisterClass<"SPIRV", [f32], 32, (add fID0)>;
+ def fID64 : RegisterClass<"SPIRV", [f64], 32, (add fID640)>;
def pID32 : RegisterClass<"SPIRV", [p32], 32, (add pID320)>;
def pID64 : RegisterClass<"SPIRV", [p64], 32, (add pID640)>;
def vID : RegisterClass<"SPIRV", [v2i32], 32, (add vID0)>;
@@ -48,9 +52,9 @@ let Namespace = "SPIRV" in {
def ANYID : RegisterClass<
"SPIRV",
- [i32, f32, p32, p64, v2i32, v2f32, v2p32, v2p64],
+ [i32, i64, f32, f64, p32, p64, v2i32, v2f32, v2p32, v2p64],
32,
- (add ID0, fID0, pID320, pID640, vID0, vfID0, vpID320, vpID640)>;
+ (add ID0, ID640, fID0, fID640, pID320, pID640, vID0, vfID0, vpID320, vpID640)>;
// A few instructions like OpName can take ids from both type and non-type
// instructions, so we need a super-class to allow for both to count as valid
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index 7aa0c566c75f..27472923ee08 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -82,6 +82,7 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
GR = std::make_unique<SPIRVGlobalRegistry>(PointerSize);
CallLoweringInfo = std::make_unique<SPIRVCallLowering>(TLInfo, GR.get());
+ InlineAsmInfo = std::make_unique<SPIRVInlineAsmLowering>(TLInfo);
Legalizer = std::make_unique<SPIRVLegalizerInfo>(*this);
RegBankInfo = std::make_unique<SPIRVRegisterBankInfo>();
InstSelector.reset(
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
index 3e4044084266..211216488db7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.h
@@ -16,6 +16,7 @@
#include "SPIRVCallLowering.h"
#include "SPIRVFrameLowering.h"
#include "SPIRVISelLowering.h"
+#include "SPIRVInlineAsmLowering.h"
#include "SPIRVInstrInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
@@ -54,6 +55,7 @@ private:
std::unique_ptr<RegisterBankInfo> RegBankInfo;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<InlineAsmLowering> InlineAsmInfo;
// TODO: Initialise the available extensions, extended instruction sets
// based on the environment settings.
@@ -81,6 +83,7 @@ public:
TargetTriple.getArch() == Triple::spirv64;
}
bool isVulkanEnv() const { return TargetTriple.getArch() == Triple::spirv; }
+ const std::string &getTargetTripleAsStr() const { return TargetTriple.str(); }
VersionTuple getSPIRVVersion() const { return SPIRVVersion; };
bool isAtLeastSPIRVVer(VersionTuple VerToCompareTo) const;
bool isAtLeastOpenCLVer(VersionTuple VerToCompareTo) const;
@@ -108,6 +111,9 @@ public:
InstructionSelector *getInstructionSelector() const override {
return InstSelector.get();
}
+ const InlineAsmLowering *getInlineAsmLowering() const override {
+ return InlineAsmInfo.get();
+ }
const SPIRVInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const SPIRVFrameLowering *getFrameLowering() const override {
return &FrameLowering;
diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
index 31e19ad8630c..98cbd9d2c1f2 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
+++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td
@@ -298,6 +298,7 @@ defm SPV_INTEL_optnone : ExtensionOperand<103>;
defm SPV_INTEL_function_pointers : ExtensionOperand<104>;
defm SPV_INTEL_variable_length_array : ExtensionOperand<105>;
defm SPV_INTEL_bfloat16_conversion : ExtensionOperand<106>;
+defm SPV_INTEL_inline_assembly : ExtensionOperand<107>;
//===----------------------------------------------------------------------===//
// Multiclass used to define Capabilities enum values and at the same time
@@ -413,6 +414,7 @@ defm ImageGatherBiasLodAMD : CapabilityOperand<5009, 0, 0, [], [Shader]>;
defm FragmentMaskAMD : CapabilityOperand<5010, 0, 0, [], [Shader]>;
defm StencilExportEXT : CapabilityOperand<5013, 0, 0, [], [Shader]>;
defm ImageReadWriteLodAMD : CapabilityOperand<5015, 0, 0, [], [Shader]>;
+defm ShaderClockKHR : CapabilityOperand<5055, 0, 0, [SPV_KHR_shader_clock], []>;
defm SampleMaskOverrideCoverageNV : CapabilityOperand<5249, 0, 0, [], [SampleRateShading]>;
defm GeometryShaderPassthroughNV : CapabilityOperand<5251, 0, 0, [], [Geometry]>;
defm ShaderViewportIndexLayerEXT : CapabilityOperand<5254, 0, 0, [], [MultiViewport]>;
@@ -457,6 +459,7 @@ defm BitInstructions : CapabilityOperand<6025, 0, 0, [SPV_KHR_bit_instructions],
defm ExpectAssumeKHR : CapabilityOperand<5629, 0, 0, [SPV_KHR_expect_assume], []>;
defm FunctionPointersINTEL : CapabilityOperand<5603, 0, 0, [SPV_INTEL_function_pointers], []>;
defm IndirectReferencesINTEL : CapabilityOperand<5604, 0, 0, [SPV_INTEL_function_pointers], []>;
+defm AsmINTEL : CapabilityOperand<5606, 0, 0, [SPV_INTEL_inline_assembly], []>;
defm GroupNonUniformRotateKHR : CapabilityOperand<6026, 0, 0, [SPV_KHR_subgroup_rotate], [GroupNonUniform]>;
defm AtomicFloat32AddEXT : CapabilityOperand<6033, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
defm AtomicFloat64AddEXT : CapabilityOperand<6034, 0, 0, [SPV_EXT_shader_atomic_float_add], []>;
@@ -1200,6 +1203,8 @@ defm UserSemantic : DecorationOperand<5635, 0, 0, [], []>;
defm RestrictPointerEXT : DecorationOperand<5355, 0, 0, [], [PhysicalStorageBufferAddressesEXT]>;
defm AliasedPointerEXT : DecorationOperand<5356, 0, 0, [], [PhysicalStorageBufferAddressesEXT]>;
defm ReferencedIndirectlyINTEL : DecorationOperand<5602, 0, 0, [], [IndirectReferencesINTEL]>;
+defm ClobberINTEL : DecorationOperand<5607, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>;
+defm SideEffectsINTEL : DecorationOperand<5608, 0, 0, [SPV_INTEL_inline_assembly], [AsmINTEL]>;
defm ArgumentAttributeINTEL : DecorationOperand<6409, 0, 0, [], [FunctionPointersINTEL]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
index a60588672293..da1e4836fc4f 100644
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -99,8 +99,8 @@ ADD_BINARY_VVP_OP_COMPACT(MUL)
ADD_BINARY_VVP_OP_COMPACT(UDIV)
ADD_BINARY_VVP_OP_COMPACT(SDIV)
-ADD_BINARY_VVP_OP(VVP_SRA,VP_ASHR,SRA) REGISTER_PACKED(VVP_SRA)
-ADD_BINARY_VVP_OP(VVP_SRL,VP_LSHR,SRL) REGISTER_PACKED(VVP_SRL)
+ADD_BINARY_VVP_OP(VVP_SRA,VP_SRA,SRA) REGISTER_PACKED(VVP_SRA)
+ADD_BINARY_VVP_OP(VVP_SRL,VP_SRL,SRL) REGISTER_PACKED(VVP_SRL)
ADD_BINARY_VVP_OP_COMPACT(SHL) REGISTER_PACKED(VVP_SHL)
ADD_BINARY_VVP_OP_COMPACT(AND) REGISTER_PACKED(VVP_AND)
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index d4e9fb057c44..34502170a5c7 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -345,6 +345,8 @@ inline bool isArgument(unsigned Opc) {
case WebAssembly::ARGUMENT_v4i32_S:
case WebAssembly::ARGUMENT_v2i64:
case WebAssembly::ARGUMENT_v2i64_S:
+ case WebAssembly::ARGUMENT_v8f16:
+ case WebAssembly::ARGUMENT_v8f16_S:
case WebAssembly::ARGUMENT_v4f32:
case WebAssembly::ARGUMENT_v4f32_S:
case WebAssembly::ARGUMENT_v2f64:
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
index fac2e0d935f5..867953b4e8d7 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyTypeUtilities.cpp
@@ -50,6 +50,7 @@ wasm::ValType WebAssembly::toValType(MVT Type) {
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
+ case MVT::v8f16:
case MVT::v4f32:
case MVT::v2f64:
return wasm::ValType::V128;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 3524abba8990..443558537da2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -62,7 +62,7 @@ MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16,
- MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64})
+ MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64, MVT::v8f16})
if (TRI->isTypeLegalForClass(*TRC, T))
return T;
LLVM_DEBUG(errs() << "Unknown type for register number: " << RegNo);
@@ -662,6 +662,8 @@ void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) {
case WebAssembly::ARGUMENT_v4f32_S:
case WebAssembly::ARGUMENT_v2f64:
case WebAssembly::ARGUMENT_v2f64_S:
+ case WebAssembly::ARGUMENT_v8f16:
+ case WebAssembly::ARGUMENT_v8f16_S:
// These represent values which are live into the function entry, so there's
// no instruction to emit.
break;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 1c62290704fe..26e13948bc9a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -885,18 +885,6 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
Table->setNoStrip();
MIB.addImm(0);
}
- // See if we must truncate the function pointer.
- // CALL_INDIRECT takes an i32, but in wasm64 we represent function pointers
- // as 64-bit for uniformity with other pointer types.
- // See also: WebAssemblyISelLowering.cpp: LowerCallResults
- if (Subtarget->hasAddr64()) {
- auto Wrap = BuildMI(*FuncInfo.MBB, std::prev(FuncInfo.InsertPt), MIMD,
- TII.get(WebAssembly::I32_WRAP_I64));
- Register Reg32 = createResultReg(&WebAssembly::I32RegClass);
- Wrap.addReg(Reg32, RegState::Define);
- Wrap.addReg(CalleeReg);
- CalleeReg = Reg32;
- }
}
for (unsigned ArgReg : Args)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 527bb4c9fbea..518b6932a0c8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -70,6 +70,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass);
addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass);
}
+ if (Subtarget->hasHalfPrecision()) {
+ addRegisterClass(MVT::v8f16, &WebAssembly::V128RegClass);
+ }
if (Subtarget->hasReferenceTypes()) {
addRegisterClass(MVT::externref, &WebAssembly::EXTERNREFRegClass);
addRegisterClass(MVT::funcref, &WebAssembly::FUNCREFRegClass);
@@ -576,20 +579,6 @@ LowerCallResults(MachineInstr &CallResults, DebugLoc DL, MachineBasicBlock *BB,
const MCInstrDesc &MCID = TII.get(CallOp);
MachineInstrBuilder MIB(MF, MF.CreateMachineInstr(MCID, DL));
- // See if we must truncate the function pointer.
- // CALL_INDIRECT takes an i32, but in wasm64 we represent function pointers
- // as 64-bit for uniformity with other pointer types.
- // See also: WebAssemblyFastISel::selectCall
- if (IsIndirect && MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()) {
- Register Reg32 =
- MF.getRegInfo().createVirtualRegister(&WebAssembly::I32RegClass);
- auto &FnPtr = CallParams.getOperand(0);
- BuildMI(*BB, CallResults.getIterator(), DL,
- TII.get(WebAssembly::I32_WRAP_I64), Reg32)
- .addReg(FnPtr.getReg());
- FnPtr.setReg(Reg32);
- }
-
// Move the function pointer to the end of the arguments for indirect calls
if (IsIndirect) {
auto FnPtr = CallParams.getOperand(0);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index af95dfa25a18..558e3d859dcd 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -38,6 +38,13 @@ multiclass RELAXED_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
asmstr_s, simdop, HasRelaxedSIMD>;
}
+multiclass HALF_PRECISION_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
+ list<dag> pattern_r, string asmstr_r = "",
+ string asmstr_s = "", bits<32> simdop = -1> {
+ defm "" : ABSTRACT_SIMD_I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r,
+ asmstr_s, simdop, HasHalfPrecision>;
+}
+
defm "" : ARGUMENT<V128, v16i8>;
defm "" : ARGUMENT<V128, v8i16>;
@@ -45,6 +52,7 @@ defm "" : ARGUMENT<V128, v4i32>;
defm "" : ARGUMENT<V128, v2i64>;
defm "" : ARGUMENT<V128, v4f32>;
defm "" : ARGUMENT<V128, v2f64>;
+defm "" : ARGUMENT<V128, v8f16>;
// Constrained immediate argument types. Allow any value from the minimum signed
// value to the maximum unsigned value for the lane size.
@@ -591,6 +599,14 @@ defm "" : Splat<I64x2, 18>;
defm "" : Splat<F32x4, 19>;
defm "" : Splat<F64x2, 20>;
+// Half values are not fully supported, so an intrinsic is used instead of a
+// regular Splat pattern as above.
+defm SPLAT_F16x8 :
+ HALF_PRECISION_I<(outs V128:$dst), (ins F32:$x),
+ (outs), (ins),
+ [(set (v8f16 V128:$dst), (int_wasm_splat_f16x8 F32:$x))],
+ "f16x8.splat\t$dst, $x", "f16x8.splat", 0x120>;
+
// scalar_to_vector leaves high lanes undefined, so can be a splat
foreach vec = AllVecs in
def : Pat<(vec.vt (scalar_to_vector (vec.lane_vt vec.lane_rc:$x))),
@@ -644,6 +660,14 @@ def : Pat<
(and (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx)), (i32 0xffff)),
(EXTRACT_LANE_I16x8_u $vec, imm:$idx)>;
+defm EXTRACT_LANE_F16x8 :
+ HALF_PRECISION_I<(outs F32:$dst), (ins V128:$vec, vec_i8imm_op:$idx),
+ (outs), (ins vec_i8imm_op:$idx),
+ [(set (f32 F32:$dst), (int_wasm_extract_lane_f16x8
+ (v8f16 V128:$vec), (i32 LaneIdx16:$idx)))],
+ "f16x8.extract_lane\t$dst, $vec, $idx",
+ "f16x8.extract_lane\t$idx", 0x121>;
+
// Replace lane value: replace_lane
multiclass ReplaceLane<Vec vec, bits<32> simdop> {
defm REPLACE_LANE_#vec :
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index ba2936b492a9..4e2faa608be0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -63,7 +63,8 @@ def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32, I32_0)>;
def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64, I64_0)>;
def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
-def V128 : WebAssemblyRegClass<[v4f32, v2f64, v2i64, v4i32, v16i8, v8i16], 128,
- (add V128_0)>;
+def V128 : WebAssemblyRegClass<[v8f16, v4f32, v2f64, v2i64, v4i32, v16i8,
+ v8i16],
+ 128, (add V128_0)>;
def FUNCREF : WebAssemblyRegClass<[funcref], 0, (add FUNCREF_0)>;
def EXTERNREF : WebAssemblyRegClass<[externref], 0, (add EXTERNREF_0)>;
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 54642ecde18c..7e8133e3e1ac 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -124,24 +124,15 @@ def FeatureEVEX512 : SubtargetFeature<"evex512", "HasEVEX512", "true",
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512",
"Enable AVX-512 instructions",
[FeatureAVX2, FeatureFMA, FeatureF16C]>;
-def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
- "Enable AVX-512 Exponential and Reciprocal Instructions",
- [FeatureAVX512]>;
def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true",
"Enable AVX-512 Conflict Detection Instructions",
[FeatureAVX512]>;
def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ",
"true", "Enable AVX-512 Population Count Instructions",
[FeatureAVX512]>;
-def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true",
- "Enable AVX-512 PreFetch Instructions",
- [FeatureAVX512]>;
def FeaturePREFETCHI : SubtargetFeature<"prefetchi", "HasPREFETCHI",
"true",
"Prefetch instruction with T0 or T1 Hint">;
-def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1",
- "true",
- "Prefetch with Intent to Write and T1 Hint">;
def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true",
"Enable AVX-512 Doubleword and Quadword Instructions",
[FeatureAVX512]>;
@@ -1312,10 +1303,7 @@ def ProcessorFeatures {
FeatureFSGSBase,
FeatureAVX512,
FeatureEVEX512,
- FeatureERI,
FeatureCDI,
- FeaturePFI,
- FeaturePREFETCHWT1,
FeatureADX,
FeatureRDSEED,
FeatureMOVBE,
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index db1d21b59a7b..a0c91d4e3c3d 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -53,7 +53,6 @@
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/Passes.h"
@@ -113,8 +112,6 @@ public:
FixupBWInstPass() : MachineFunctionPass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
- // guide some heuristics.
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -141,9 +138,6 @@ private:
/// Local member for function's OptForSize attribute.
bool OptForSize = false;
- /// Machine loop info used for guiding some heruistics.
- MachineLoopInfo *MLI = nullptr;
-
/// Register Liveness information after the current instruction.
LiveRegUnits LiveUnits;
@@ -164,7 +158,6 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
TRI = MF.getRegInfo().getTargetRegisterInfo();
- MLI = &getAnalysis<MachineLoopInfo>();
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
MBFI = (PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index ea5ef5b5a602..80ff98b46617 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -73,7 +73,7 @@ using CondRegArray = std::array<unsigned, X86::LAST_VALID_COND + 1>;
class X86FlagsCopyLoweringPass : public MachineFunctionPass {
public:
- X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) { }
+ X86FlagsCopyLoweringPass() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "X86 EFLAGS copy lowering"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -102,32 +102,14 @@ private:
void insertTest(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
const DebugLoc &Loc, unsigned Reg);
- void rewriteArithmetic(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &MI,
- MachineOperand &FlagUse, CondRegArray &CondRegs);
- void rewriteCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, const DebugLoc &TestLoc,
- MachineInstr &CMovI, MachineOperand &FlagUse,
- CondRegArray &CondRegs);
- void rewriteFCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &CMovI,
- MachineOperand &FlagUse, CondRegArray &CondRegs);
- void rewriteCondJmp(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &JmpI,
- CondRegArray &CondRegs);
- void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
- MachineInstr &CopyDefI);
- void rewriteSetCC(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &SetCCI,
- MachineOperand &FlagUse, CondRegArray &CondRegs);
- void rewriteCCMP(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos, const DebugLoc &TestLoc,
- MachineInstr &CMovI, MachineOperand &FlagUse,
- CondRegArray &CondRegs);
+ void rewriteSetCC(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc, MachineInstr &MI,
+ CondRegArray &CondRegs);
+ void rewriteArithmetic(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos, const DebugLoc &Loc,
+ MachineInstr &MI, CondRegArray &CondRegs);
+ void rewriteMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc, MachineInstr &MI, CondRegArray &CondRegs);
};
} // end anonymous namespace
@@ -148,85 +130,9 @@ void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
-namespace {
-/// An enumeration of the arithmetic instruction mnemonics which have
-/// interesting flag semantics.
-///
-/// We can map instruction opcodes into these mnemonics to make it easy to
-/// dispatch with specific functionality.
-enum class FlagArithMnemonic {
- ADC,
- RCL,
- RCR,
- SBB,
- SETB,
-};
-} // namespace
-
-static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
- switch (Opcode) {
- default:
- report_fatal_error("No support for lowering a copy into EFLAGS when used "
- "by this instruction!");
-
-#define CASE_ND(OP) \
- case X86::OP: \
- case X86::OP##_ND:
-
-#define LLVM_EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX) \
- CASE_ND(MNEMONIC##8##SUFFIX) \
- CASE_ND(MNEMONIC##16##SUFFIX) \
- CASE_ND(MNEMONIC##32##SUFFIX) \
- CASE_ND(MNEMONIC##64##SUFFIX)
-
-#define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC) \
- LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr) \
- LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm) \
- LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr) \
- CASE_ND(MNEMONIC##8ri) \
- CASE_ND(MNEMONIC##16ri8) \
- CASE_ND(MNEMONIC##32ri8) \
- CASE_ND(MNEMONIC##64ri8) \
- CASE_ND(MNEMONIC##16ri) \
- CASE_ND(MNEMONIC##32ri) \
- CASE_ND(MNEMONIC##64ri32) \
- CASE_ND(MNEMONIC##8mi) \
- CASE_ND(MNEMONIC##16mi8) \
- CASE_ND(MNEMONIC##32mi8) \
- CASE_ND(MNEMONIC##64mi8) \
- CASE_ND(MNEMONIC##16mi) \
- CASE_ND(MNEMONIC##32mi) \
- CASE_ND(MNEMONIC##64mi32) \
- case X86::MNEMONIC##8i8: \
- case X86::MNEMONIC##16i16: \
- case X86::MNEMONIC##32i32: \
- case X86::MNEMONIC##64i32:
-
- LLVM_EXPAND_ADC_SBB_INSTR(ADC)
- return FlagArithMnemonic::ADC;
-
- LLVM_EXPAND_ADC_SBB_INSTR(SBB)
- return FlagArithMnemonic::SBB;
-
-#undef LLVM_EXPAND_ADC_SBB_INSTR
-
- LLVM_EXPAND_INSTR_SIZES(RCL, rCL)
- LLVM_EXPAND_INSTR_SIZES(RCL, r1)
- LLVM_EXPAND_INSTR_SIZES(RCL, ri)
- return FlagArithMnemonic::RCL;
-
- LLVM_EXPAND_INSTR_SIZES(RCR, rCL)
- LLVM_EXPAND_INSTR_SIZES(RCR, r1)
- LLVM_EXPAND_INSTR_SIZES(RCR, ri)
- return FlagArithMnemonic::RCR;
-
-#undef LLVM_EXPAND_INSTR_SIZES
-#undef CASE_ND
-
- case X86::SETB_C32r:
- case X86::SETB_C64r:
- return FlagArithMnemonic::SETB;
- }
+static bool isArithmeticOp(unsigned Opc) {
+ return X86::isADC(Opc) || X86::isSBB(Opc) || X86::isRCL(Opc) ||
+ X86::isRCR(Opc) || (Opc == X86::SETB_C32r || Opc == X86::SETB_C64r);
}
static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
@@ -329,28 +235,6 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
return NewMBB;
}
-static X86::CondCode getCondFromFCMOV(unsigned Opcode) {
- switch (Opcode) {
- default: return X86::COND_INVALID;
- case X86::CMOVBE_Fp32: case X86::CMOVBE_Fp64: case X86::CMOVBE_Fp80:
- return X86::COND_BE;
- case X86::CMOVB_Fp32: case X86::CMOVB_Fp64: case X86::CMOVB_Fp80:
- return X86::COND_B;
- case X86::CMOVE_Fp32: case X86::CMOVE_Fp64: case X86::CMOVE_Fp80:
- return X86::COND_E;
- case X86::CMOVNBE_Fp32: case X86::CMOVNBE_Fp64: case X86::CMOVNBE_Fp80:
- return X86::COND_A;
- case X86::CMOVNB_Fp32: case X86::CMOVNB_Fp64: case X86::CMOVNB_Fp80:
- return X86::COND_AE;
- case X86::CMOVNE_Fp32: case X86::CMOVNE_Fp64: case X86::CMOVNE_Fp80:
- return X86::COND_NE;
- case X86::CMOVNP_Fp32: case X86::CMOVNP_Fp64: case X86::CMOVNP_Fp80:
- return X86::COND_NP;
- case X86::CMOVP_Fp32: case X86::CMOVP_Fp64: case X86::CMOVP_Fp80:
- return X86::COND_P;
- }
-}
-
bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
<< " **********\n");
@@ -362,7 +246,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
MDT = &getAnalysis<MachineDominatorTree>();
PromoteRC = &X86::GR8RegClass;
- if (MF.begin() == MF.end())
+ if (MF.empty())
// Nothing to do for a degenerate empty function...
return false;
@@ -569,20 +453,12 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
MachineOperand *FlagUse =
MI.findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr);
- if (!FlagUse) {
- if (MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr)) {
- // If EFLAGS are defined, it's as-if they were killed. We can stop
- // scanning here.
- //
- // NB!!! Many instructions only modify some flags. LLVM currently
- // models this as clobbering all flags, but if that ever changes
- // this will need to be carefully updated to handle that more
- // complex logic.
- FlagsKilled = true;
- break;
- }
+ FlagsKilled = MI.modifiesRegister(X86::EFLAGS, TRI);
+
+ if (!FlagUse && FlagsKilled)
+ break;
+ else if (!FlagUse)
continue;
- }
LLVM_DEBUG(dbgs() << " Rewriting use: "; MI.dump());
@@ -604,40 +480,23 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
JmpIs.push_back(&*JmpIt);
++JmpIt;
} while (JmpIt != UseMBB.instr_end() &&
- X86::getCondFromBranch(*JmpIt) !=
- X86::COND_INVALID);
+ X86::getCondFromBranch(*JmpIt) != X86::COND_INVALID);
break;
}
// Otherwise we can just rewrite in-place.
- if (X86::getCondFromCMov(MI) != X86::COND_INVALID ||
- X86::getCondFromCFCMov(MI) != X86::COND_INVALID) {
- rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) {
- rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) {
- rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (X86::getCondFromCCMP(MI) != X86::COND_INVALID) {
- rewriteCCMP(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- FlagsKilled = true;
- } else if (MI.getOpcode() == TargetOpcode::COPY) {
- rewriteCopy(MI, *FlagUse, CopyDefI);
+ unsigned Opc = MI.getOpcode();
+ if (Opc == TargetOpcode::COPY) {
+ // Just replace this copy with the original copy def.
+ MRI->replaceRegWith(MI.getOperand(0).getReg(),
+ CopyDefI.getOperand(0).getReg());
+ MI.eraseFromParent();
+ } else if (X86::isSETCC(Opc)) {
+ rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, CondRegs);
+ } else if (isArithmeticOp(Opc)) {
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, CondRegs);
} else {
- // We assume all other instructions that use flags also def them.
- assert(MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr) &&
- "Expected a def of EFLAGS for this instruction!");
-
- // NB!!! Several arithmetic instructions only *partially* update
- // flags. Theoretically, we could generate MI code sequences that
- // would rely on this fact and observe different flags independently.
- // But currently LLVM models all of these instructions as clobbering
- // all the flags in an undef way. We rely on that to simplify the
- // logic.
- FlagsKilled = true;
-
- // Generically handle remaining uses as arithmetic instructions.
- rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
- CondRegs);
+ rewriteMI(*TestMBB, TestPos, TestLoc, MI, CondRegs);
}
// If this was the last use of the flags, we're done.
@@ -702,7 +561,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
else
LastJmpMBB = JmpI->getParent();
- rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+ rewriteMI(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
}
// FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
@@ -753,8 +612,8 @@ Register X86FlagsCopyLoweringPass::promoteCondToReg(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
const DebugLoc &TestLoc, X86::CondCode Cond) {
Register Reg = MRI->createVirtualRegister(PromoteRC);
- auto SetI = BuildMI(TestMBB, TestPos, TestLoc,
- TII->get(X86::SETCCr), Reg).addImm(Cond);
+ auto SetI = BuildMI(TestMBB, TestPos, TestLoc, TII->get(X86::SETCCr), Reg)
+ .addImm(Cond);
(void)SetI;
LLVM_DEBUG(dbgs() << " save cond: "; SetI->dump());
++NumSetCCsInserted;
@@ -785,43 +644,66 @@ void X86FlagsCopyLoweringPass::insertTest(MachineBasicBlock &MBB,
++NumTestsInserted;
}
-void X86FlagsCopyLoweringPass::rewriteArithmetic(
- MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &MI, MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // Arithmetic is either reading CF or OF. Figure out which condition we need
- // to preserve in a register.
- X86::CondCode Cond = X86::COND_INVALID;
+void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc,
+ MachineInstr &MI,
+ CondRegArray &CondRegs) {
+ X86::CondCode Cond = X86::getCondFromSETCC(MI);
+ // Note that we can't usefully rewrite this to the inverse without complex
+ // analysis of the users of the setCC. Largely we rely on duplicates which
+ // could have been avoided already being avoided here.
+ unsigned &CondReg = CondRegs[Cond];
+ if (!CondReg)
+ CondReg = promoteCondToReg(MBB, Pos, Loc, Cond);
- // The addend to use to reset CF or OF when added to the flag value.
- int Addend = 0;
-
- switch (getMnemonicFromOpcode(MI.getOpcode())) {
- case FlagArithMnemonic::ADC:
- case FlagArithMnemonic::RCL:
- case FlagArithMnemonic::RCR:
- case FlagArithMnemonic::SBB:
- case FlagArithMnemonic::SETB:
- Cond = X86::COND_B; // CF == 1
- // Set up an addend that when one is added will need a carry due to not
- // having a higher bit available.
- Addend = 255;
- break;
+ // Rewriting a register def is trivial: we just replace the register and
+ // remove the setcc.
+ if (!MI.mayStore()) {
+ assert(MI.getOperand(0).isReg() &&
+ "Cannot have a non-register defined operand to SETcc!");
+ Register OldReg = MI.getOperand(0).getReg();
+ // Drop Kill flags on the old register before replacing. CondReg may have
+ // a longer live range.
+ MRI->clearKillFlags(OldReg);
+ MRI->replaceRegWith(OldReg, CondReg);
+ MI.eraseFromParent();
+ return;
}
+ // Otherwise, we need to emit a store.
+ auto MIB = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+ TII->get(X86::MOV8mr));
+ // Copy the address operands.
+ for (int i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.add(MI.getOperand(i));
+
+ MIB.addReg(CondReg);
+ MIB.setMemRefs(MI.memoperands());
+ MI.eraseFromParent();
+}
+
+void X86FlagsCopyLoweringPass::rewriteArithmetic(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc, MachineInstr &MI, CondRegArray &CondRegs) {
+ // Arithmetic is either reading CF or OF.
+ X86::CondCode Cond = X86::COND_B; // CF == 1
+ // The addend to use to reset CF or OF when added to the flag value.
+ // Set up an addend that when one is added will need a carry due to not
+ // having a higher bit available.
+ int Addend = 255;
+
// Now get a register that contains the value of the flag input to the
// arithmetic. We require exactly this flag to simplify the arithmetic
// required to materialize it back into the flag.
unsigned &CondReg = CondRegs[Cond];
if (!CondReg)
- CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
-
- MachineBasicBlock &MBB = *MI.getParent();
+ CondReg = promoteCondToReg(MBB, Pos, Loc, Cond);
// Insert an instruction that will set the flag back to the desired value.
Register TmpReg = MRI->createVirtualRegister(PromoteRC);
auto AddI =
- BuildMI(MBB, MI.getIterator(), MI.getDebugLoc(),
+ BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(Subtarget->hasNDD() ? X86::ADD8ri_ND : X86::ADD8ri))
.addDef(TmpReg, RegState::Dead)
.addReg(CondReg)
@@ -829,177 +711,81 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic(
(void)AddI;
LLVM_DEBUG(dbgs() << " add cond: "; AddI->dump());
++NumAddsInserted;
- FlagUse.setIsKill(true);
+ MI.findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->setIsKill(true);
}
-void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc,
- MachineInstr &CMovI,
- MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromCMov(CMovI) == X86::COND_INVALID
- ? X86::getCondFromCFCMov(CMovI)
- : X86::getCondFromCMov(CMovI);
- unsigned CondReg;
- bool Inverted;
- std::tie(CondReg, Inverted) =
- getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
-
- MachineBasicBlock &MBB = *CMovI.getParent();
+static X86::CondCode getImplicitCondFromMI(unsigned Opc) {
+#define FROM_TO(A, B) \
+ case X86::CMOV##A##_Fp32: \
+ case X86::CMOV##A##_Fp64: \
+ case X86::CMOV##A##_Fp80: \
+ return X86::COND_##B;
- // Insert a direct test of the saved register.
- insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
-
- // Rewrite the CMov to use the !ZF flag from the test, and then kill its use
- // of the flags afterward.
- CMovI.getOperand(CMovI.getDesc().getNumOperands() - 1)
- .setImm(Inverted ? X86::COND_E : X86::COND_NE);
- FlagUse.setIsKill(true);
- LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
+ switch (Opc) {
+ default:
+ return X86::COND_INVALID;
+ FROM_TO(B, B)
+ FROM_TO(E, E)
+ FROM_TO(P, P)
+ FROM_TO(BE, BE)
+ FROM_TO(NB, AE)
+ FROM_TO(NE, NE)
+ FROM_TO(NP, NP)
+ FROM_TO(NBE, A)
+ }
+#undef FROM_TO
}
-void X86FlagsCopyLoweringPass::rewriteFCMov(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc,
- MachineInstr &CMovI,
- MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // First get the register containing this specific condition.
- X86::CondCode Cond = getCondFromFCMOV(CMovI.getOpcode());
- unsigned CondReg;
- bool Inverted;
- std::tie(CondReg, Inverted) =
- getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
-
- MachineBasicBlock &MBB = *CMovI.getParent();
-
- // Insert a direct test of the saved register.
- insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
-
- auto getFCMOVOpcode = [](unsigned Opcode, bool Inverted) {
- switch (Opcode) {
- default: llvm_unreachable("Unexpected opcode!");
- case X86::CMOVBE_Fp32: case X86::CMOVNBE_Fp32:
- case X86::CMOVB_Fp32: case X86::CMOVNB_Fp32:
- case X86::CMOVE_Fp32: case X86::CMOVNE_Fp32:
- case X86::CMOVP_Fp32: case X86::CMOVNP_Fp32:
- return Inverted ? X86::CMOVE_Fp32 : X86::CMOVNE_Fp32;
- case X86::CMOVBE_Fp64: case X86::CMOVNBE_Fp64:
- case X86::CMOVB_Fp64: case X86::CMOVNB_Fp64:
- case X86::CMOVE_Fp64: case X86::CMOVNE_Fp64:
- case X86::CMOVP_Fp64: case X86::CMOVNP_Fp64:
- return Inverted ? X86::CMOVE_Fp64 : X86::CMOVNE_Fp64;
- case X86::CMOVBE_Fp80: case X86::CMOVNBE_Fp80:
- case X86::CMOVB_Fp80: case X86::CMOVNB_Fp80:
- case X86::CMOVE_Fp80: case X86::CMOVNE_Fp80:
- case X86::CMOVP_Fp80: case X86::CMOVNP_Fp80:
- return Inverted ? X86::CMOVE_Fp80 : X86::CMOVNE_Fp80;
- }
- };
-
- // Rewrite the CMov to use the !ZF flag from the test.
- CMovI.setDesc(TII->get(getFCMOVOpcode(CMovI.getOpcode(), Inverted)));
- FlagUse.setIsKill(true);
- LLVM_DEBUG(dbgs() << " fixed fcmov: "; CMovI.dump());
+static unsigned getOpcodeWithCC(unsigned Opc, X86::CondCode CC) {
+ assert((CC == X86::COND_E || CC == X86::COND_NE) && "Unexpected CC");
+#define CASE(A) \
+ case X86::CMOVB_##A: \
+ case X86::CMOVE_##A: \
+ case X86::CMOVP_##A: \
+ case X86::CMOVBE_##A: \
+ case X86::CMOVNB_##A: \
+ case X86::CMOVNE_##A: \
+ case X86::CMOVNP_##A: \
+ case X86::CMOVNBE_##A: \
+ return (CC == X86::COND_E) ? X86::CMOVE_##A : X86::CMOVNE_##A;
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ CASE(Fp32)
+ CASE(Fp64)
+ CASE(Fp80)
+ }
+#undef CASE
}
-void X86FlagsCopyLoweringPass::rewriteCondJmp(
- MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
+void X86FlagsCopyLoweringPass::rewriteMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Pos,
+ const DebugLoc &Loc, MachineInstr &MI,
+ CondRegArray &CondRegs) {
// First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromBranch(JmpI);
+ bool IsImplicitCC = false;
+ X86::CondCode CC = X86::getCondFromMI(MI);
+ if (CC == X86::COND_INVALID) {
+ CC = getImplicitCondFromMI(MI.getOpcode());
+ IsImplicitCC = true;
+ }
+ assert(CC != X86::COND_INVALID && "Unknown EFLAG user!");
unsigned CondReg;
bool Inverted;
std::tie(CondReg, Inverted) =
- getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
-
- MachineBasicBlock &JmpMBB = *JmpI.getParent();
+ getCondOrInverseInReg(MBB, Pos, Loc, CC, CondRegs);
// Insert a direct test of the saved register.
- insertTest(JmpMBB, JmpI.getIterator(), JmpI.getDebugLoc(), CondReg);
-
- // Rewrite the jump to use the !ZF flag from the test, and kill its use of
- // flags afterward.
- JmpI.getOperand(1).setImm(Inverted ? X86::COND_E : X86::COND_NE);
- JmpI.findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->setIsKill(true);
- LLVM_DEBUG(dbgs() << " fixed jCC: "; JmpI.dump());
-}
-
-void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
- MachineOperand &FlagUse,
- MachineInstr &CopyDefI) {
- // Just replace this copy with the original copy def.
- MRI->replaceRegWith(MI.getOperand(0).getReg(),
- CopyDefI.getOperand(0).getReg());
- MI.eraseFromParent();
-}
-
-void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc,
- MachineInstr &SetCCI,
- MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- X86::CondCode Cond = X86::getCondFromSETCC(SetCCI);
- // Note that we can't usefully rewrite this to the inverse without complex
- // analysis of the users of the setCC. Largely we rely on duplicates which
- // could have been avoided already being avoided here.
- unsigned &CondReg = CondRegs[Cond];
- if (!CondReg)
- CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, Cond);
-
- // Rewriting a register def is trivial: we just replace the register and
- // remove the setcc.
- if (!SetCCI.mayStore()) {
- assert(SetCCI.getOperand(0).isReg() &&
- "Cannot have a non-register defined operand to SETcc!");
- Register OldReg = SetCCI.getOperand(0).getReg();
- // Drop Kill flags on the old register before replacing. CondReg may have
- // a longer live range.
- MRI->clearKillFlags(OldReg);
- MRI->replaceRegWith(OldReg, CondReg);
- SetCCI.eraseFromParent();
- return;
- }
+ insertTest(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), CondReg);
- // Otherwise, we need to emit a store.
- auto MIB = BuildMI(*SetCCI.getParent(), SetCCI.getIterator(),
- SetCCI.getDebugLoc(), TII->get(X86::MOV8mr));
- // Copy the address operands.
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.add(SetCCI.getOperand(i));
-
- MIB.addReg(CondReg);
-
- MIB.setMemRefs(SetCCI.memoperands());
-
- SetCCI.eraseFromParent();
-}
-
-void X86FlagsCopyLoweringPass::rewriteCCMP(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- const DebugLoc &TestLoc,
- MachineInstr &CCMPI,
- MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // First get the register containing this specific condition.
- X86::CondCode Cond = X86::getCondFromCCMP(CCMPI);
- unsigned CondReg;
- bool Inverted;
- std::tie(CondReg, Inverted) =
- getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
-
- MachineBasicBlock &MBB = *CCMPI.getParent();
+ // Rewrite the instruction to use the !ZF flag from the test, and then kill
+ // its use of the flags afterward.
+ X86::CondCode NewCC = Inverted ? X86::COND_E : X86::COND_NE;
+ if (IsImplicitCC)
+ MI.setDesc(TII->get(getOpcodeWithCC(MI.getOpcode(), NewCC)));
+ else
+ MI.getOperand(MI.getDesc().getNumOperands() - 1).setImm(NewCC);
- // Insert a direct test of the saved register.
- insertTest(MBB, CCMPI.getIterator(), CCMPI.getDebugLoc(), CondReg);
-
- // Rewrite the CCMP/CTEST to use the !ZF flag from the test, and then kill its
- // use of the flags afterward.
- CCMPI.getOperand(CCMPI.getDesc().getNumOperands() - 1)
- .setImm(Inverted ? X86::COND_E : X86::COND_NE);
- FlagUse.setIsKill(true);
- LLVM_DEBUG(dbgs() << " fixed ccmp/ctest: "; CCMPI.dump());
+ MI.findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->setIsKill(true);
+ LLVM_DEBUG(dbgs() << " fixed instruction: "; MI.dump());
}
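
The X86FlagsCopyLowering changes above fold the per-instruction rewrite helpers into one rewriteMI path that reads the condition either from an explicit CC operand or, for the FP CMOV pseudo-instructions, from the opcode itself, and then retargets it to ZF after a TEST of the saved condition byte. The following is a minimal standalone sketch of that idea, not the pass itself; the enum and opcode names are local stand-ins rather than X86:: definitions.

#include <cassert>
#include <cstdio>

enum CondCode { COND_B, COND_E, COND_P, COND_AE, COND_NE, COND_INVALID };
enum Opcode   { CMOVB_Fp32, CMOVE_Fp32, CMOVNB_Fp32, CMOVNE_Fp32 };

// FP cmovs encode their condition in the opcode instead of an operand.
static CondCode getImplicitCond(Opcode Opc) {
  switch (Opc) {
  case CMOVB_Fp32:  return COND_B;
  case CMOVE_Fp32:  return COND_E;
  case CMOVNB_Fp32: return COND_AE;
  case CMOVNE_Fp32: return COND_NE;
  }
  return COND_INVALID; // instructions with an explicit CC operand go elsewhere
}

// After a TEST of the saved condition register, the use reads ZF: COND_NE if
// the register held the condition itself, COND_E if it held the inverse.
static CondCode rewrittenCond(bool Inverted) { return Inverted ? COND_E : COND_NE; }

int main() {
  assert(getImplicitCond(CMOVNB_Fp32) == COND_AE);
  std::printf("CMOVNB_Fp32 cond = %d, rewritten (inverted) = %d\n",
              getImplicitCond(CMOVNB_Fp32), rewrittenCond(true));
  return 0;
}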
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5d0846453685..ca32cfe54233 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1108,13 +1108,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
}
- setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
- setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
- setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
- setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
- setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
- setOperationAction(ISD::ABDS, MVT::v4i32, Custom);
-
setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
@@ -1132,9 +1125,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::ABS, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::CTPOP, VT, Custom);
+ setOperationAction(ISD::ABS, VT, Custom);
+ setOperationAction(ISD::ABDS, VT, Custom);
+ setOperationAction(ISD::ABDU, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
// setcc all the way to isel and prefer SETGT in some isel patterns.
@@ -1336,11 +1331,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
- for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
- setOperationAction(ISD::ABDS, VT, Custom);
- setOperationAction(ISD::ABDU, VT, Custom);
- }
-
setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
@@ -2032,6 +2022,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
}
+
+ setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
+ setOperationAction(ISD::FABS, MVT::v32f16, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom);
}// useAVX512Regs
if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
@@ -2108,9 +2102,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
- setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
- setOperationAction(ISD::FABS, MVT::v32f16, Custom);
- setOperationAction(ISD::FCOPYSIGN, MVT::v32f16, Custom);
}
// This block control legalization of v32i1/v64i1 which are available with
@@ -3292,7 +3283,7 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
if (VT != MVT::i32 && VT != MVT::i64)
return false;
- return !isa<ConstantSDNode>(Y);
+ return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
}
bool X86TargetLowering::hasAndNot(SDValue Y) const {
@@ -20130,12 +20121,11 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
return Res;
}
-static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
- SDLoc dl(Op);
unsigned Opc = Op.getOpcode();
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
@@ -20206,14 +20196,13 @@ static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
}
-static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
- SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
// For all vectors, but vXi8 we can just emit a sign_extend and a shift. This
@@ -20268,12 +20257,13 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
+ SDLoc DL(Op);
if (SVT.getVectorElementType() == MVT::i1)
- return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
+ return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
- return LowerAVXExtend(Op, DAG, Subtarget);
+ return LowerAVXExtend(Op, DL, DAG, Subtarget);
}
/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
@@ -24320,7 +24310,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
}
-static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
+static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
@@ -24328,8 +24318,6 @@ static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
MVT InVT = In.getSimpleValueType();
assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
MVT VTElt = VT.getVectorElementType();
- SDLoc dl(Op);
-
unsigned NumElts = VT.getVectorNumElements();
// Extend VT if the scalar type is i8/i16 and BWI is not supported.
@@ -24381,12 +24369,13 @@ static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
+ SDLoc DL(Op);
if (InVT.getVectorElementType() == MVT::i1)
- return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
+ return LowerSIGN_EXTEND_Mask(Op, DL, Subtarget, DAG);
assert(Subtarget.hasAVX() && "Expected AVX support");
- return LowerAVXExtend(Op, DAG, Subtarget);
+ return LowerAVXExtend(Op, DL, DAG, Subtarget);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
@@ -24524,7 +24513,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
if (InVT.getVectorElementType() == MVT::i1)
- return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
+ return LowerSIGN_EXTEND_Mask(Op, dl, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
@@ -28421,18 +28410,6 @@ static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
}
}
- // TODO: Move to TargetLowering expandABD().
- if (!Subtarget.hasSSE41() &&
- ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
- SDValue LHS = DAG.getFreeze(Op.getOperand(0));
- SDValue RHS = DAG.getFreeze(Op.getOperand(1));
- ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
- SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
- SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
- SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
- return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
- }
-
// Default to expand.
return SDValue();
}
@@ -33849,18 +33826,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ADDSUB)
NODE_NAME_CASE(RCP14)
NODE_NAME_CASE(RCP14S)
- NODE_NAME_CASE(RCP28)
- NODE_NAME_CASE(RCP28_SAE)
- NODE_NAME_CASE(RCP28S)
- NODE_NAME_CASE(RCP28S_SAE)
- NODE_NAME_CASE(EXP2)
- NODE_NAME_CASE(EXP2_SAE)
NODE_NAME_CASE(RSQRT14)
NODE_NAME_CASE(RSQRT14S)
- NODE_NAME_CASE(RSQRT28)
- NODE_NAME_CASE(RSQRT28_SAE)
- NODE_NAME_CASE(RSQRT28S)
- NODE_NAME_CASE(RSQRT28S_SAE)
NODE_NAME_CASE(FADD_RND)
NODE_NAME_CASE(FADDS)
NODE_NAME_CASE(FADDS_RND)
@@ -42963,7 +42930,6 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
bool PoisonOnly, unsigned Depth) const {
unsigned NumElts = DemandedElts.getBitWidth();
- // TODO: Add more target shuffles.
switch (Op.getOpcode()) {
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI: {
@@ -42999,8 +42965,12 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
- // TODO: Add more target shuffles.
switch (Op.getOpcode()) {
+ // SSE vector shifts handle out of bounds shift amounts.
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI:
+ return false;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
case X86ISD::UNPCKH:
@@ -43443,7 +43413,11 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
// the chain.
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ unsigned Depth = 0) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned Opc = V.getOpcode();
switch (Opc) {
@@ -43455,14 +43429,22 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
return DAG.getBitcast(VT, Src);
break;
}
+ case ISD::Constant: {
+ auto *C = cast<ConstantSDNode>(V);
+ if (C->isZero())
+ return DAG.getConstant(0, DL, VT);
+ if (C->isAllOnes())
+ return DAG.getAllOnesConstant(DL, VT);
+ break;
+ }
case ISD::TRUNCATE: {
// If we find a suitable source, a truncated scalar becomes a subvector.
SDValue Src = V.getOperand(0);
EVT NewSrcVT =
EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
if (TLI.isTypeLegal(NewSrcVT))
- if (SDValue N0 =
- combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
+ Subtarget, Depth + 1))
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
DAG.getIntPtrConstant(0, DL));
break;
@@ -43474,20 +43456,22 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
Src.getScalarValueSizeInBits());
if (TLI.isTypeLegal(NewSrcVT))
- if (SDValue N0 =
- combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ if (SDValue N0 = combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG,
+ Subtarget, Depth + 1))
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
: DAG.getConstant(0, DL, VT),
N0, DAG.getIntPtrConstant(0, DL));
break;
}
- case ISD::OR: {
- // If we find suitable sources, we can just move an OR to the vector domain.
- SDValue Src0 = V.getOperand(0);
- SDValue Src1 = V.getOperand(1);
- if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
- if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
+ case ISD::OR:
+ case ISD::XOR: {
+ // If we find suitable sources, we can just move the op to the vector
+ // domain.
+ if (SDValue N0 = combineBitcastToBoolVector(VT, V.getOperand(0), DL, DAG,
+ Subtarget, Depth + 1))
+ if (SDValue N1 = combineBitcastToBoolVector(VT, V.getOperand(1), DL, DAG,
+ Subtarget, Depth + 1))
return DAG.getNode(Opc, DL, VT, N0, N1);
break;
}
@@ -43499,13 +43483,20 @@ static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
break;
if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
- if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget,
+ Depth + 1))
return DAG.getNode(
X86ISD::KSHIFTL, DL, VT, N0,
DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
break;
}
}
+
+ // Does the inner bitcast already exist?
+ if (Depth > 0)
+ if (SDNode *Alt = DAG.getNodeIfExists(ISD::BITCAST, DAG.getVTList(VT), {V}))
+ return SDValue(Alt, 0);
+
return SDValue();
}
@@ -43694,14 +43685,14 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return combinevXi1ConstantToInteger(N0, DAG);
}
- if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
- VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- isa<ConstantSDNode>(N0)) {
- auto *C = cast<ConstantSDNode>(N0);
- if (C->isAllOnes())
- return DAG.getConstant(1, SDLoc(N0), VT);
- if (C->isZero())
- return DAG.getConstant(0, SDLoc(N0), VT);
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N0)) {
+ if (C->isAllOnes())
+ return DAG.getConstant(1, SDLoc(N0), VT);
+ if (C->isZero())
+ return DAG.getConstant(0, SDLoc(N0), VT);
+ }
}
// Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
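
One detail of the combineBitcastToBoolVector hunks above is the new Depth parameter: every recursive call passes Depth + 1 and bails out at a fixed limit, so long truncate/extend/shift chains cannot recurse without bound. A minimal standalone sketch of that pattern follows; the node type and limit are illustrative stand-ins, not SelectionDAG.

#include <cstdio>
#include <memory>

struct Node {
  int Value;
  std::unique_ptr<Node> Operand; // single-operand chain is enough for the sketch
};

constexpr unsigned MaxRecursionDepth = 6;

// Returns true if a "bool vector" form is found within the depth budget.
static bool combineToBoolVector(const Node *N, unsigned Depth = 0) {
  if (!N || Depth >= MaxRecursionDepth)
    return false;                       // limit search depth, as in the patch
  if (N->Value == 0 || N->Value == -1)
    return true;                        // constants fold directly, like the new ISD::Constant case
  return combineToBoolVector(N->Operand.get(), Depth + 1);
}

int main() {
  Node Chain{42, std::make_unique<Node>()}; // inner node value-initializes to 0
  std::printf("combined to bool vector: %s\n",
              combineToBoolVector(&Chain) ? "yes" : "no");
  return 0;
}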
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index ade54f73bff0..14b9eb732943 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -699,18 +699,6 @@ namespace llvm {
// Test if in transactional execution.
XTEST,
- // ERI instructions.
- RSQRT28,
- RSQRT28_SAE,
- RSQRT28S,
- RSQRT28S_SAE,
- RCP28,
- RCP28_SAE,
- RCP28S,
- RCP28S_SAE,
- EXP2,
- EXP2_SAE,
-
// Conversions between float and half-float.
CVTPS2PH,
CVTPS2PH_SAE,
diff --git a/llvm/lib/Target/X86/X86Instr3DNow.td b/llvm/lib/Target/X86/X86Instr3DNow.td
index 3be03ab0f433..03612de0fad9 100644
--- a/llvm/lib/Target/X86/X86Instr3DNow.td
+++ b/llvm/lib/Target/X86/X86Instr3DNow.td
@@ -90,8 +90,7 @@ def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
TB, Requires<[HasPrefetchW]>;
def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr",
- [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))]>,
- TB, Requires<[HasPREFETCHWT1]>;
+ []>, TB;
}
// "3DNowA" instructions
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0723328d40e3..da690aea43f5 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -9265,6 +9265,37 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
}
}
+multiclass avx512_fp28_s_ass<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], hasSideEffects = 0 in {
+ defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (null_frag)>, Sched<[sched]>, SIMD_EXC;
+ defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+ "{sae}, $src2, $src1", "$src1, $src2, {sae}",
+ (null_frag)>, EVEX_B, Sched<[sched]>;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+ "$src2, $src1", "$src1, $src2",
+ (null_frag)>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+}
+
+multiclass avx512_eri_s_ass<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched> {
+ defm SSZ : avx512_fp28_s_ass<opc, OpcodeStr#"ss", f32x_info, sched>,
+ EVEX_CD8<32, CD8VT1>, VEX_LIG, T8, PD, EVEX, VVVV;
+ defm SDZ : avx512_fp28_s_ass<opc, OpcodeStr#"sd", f64x_info, sched>,
+ EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8, PD, EVEX, VVVV;
+}
+
+defm VRCP28 : avx512_eri_s_ass<0xCB, "vrcp28", SchedWriteFRcp.Scl>;
+defm VRSQRT28 : avx512_eri_s_ass<0xCD, "vrsqrt28", SchedWriteFRsqrt.Scl>;
+
multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86FoldableSchedWrite sched> {
defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE,
@@ -9280,13 +9311,6 @@ multiclass avx512_vgetexpsh<bits<8> opc, string OpcodeStr, SDNode OpNode,
EVEX_CD8<16, CD8VT1>, T_MAP6, PD, EVEX, VVVV;
}
-let Predicates = [HasERI] in {
- defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, X86rcp28SAEs,
- SchedWriteFRcp.Scl>;
- defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, X86rsqrt28SAEs,
- SchedWriteFRsqrt.Scl>;
-}
-
defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
SchedWriteFRnd.Scl>,
avx512_vgetexpsh<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
@@ -9325,6 +9349,49 @@ multiclass avx512_fp28_p_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
EVEX_B, Sched<[sched]>;
}
+multiclass avx512_fp28_p_ass<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1,
+ hasSideEffects = 0 in {
+ defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr, "$src", "$src",
+ (null_frag)>, Sched<[sched]>;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.MemOp:$src), OpcodeStr, "$src", "$src",
+ (null_frag)>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ let mayLoad = 1 in
+ defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.ScalarMemOp:$src), OpcodeStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
+ (null_frag)>,
+ EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
+}
+multiclass avx512_fp28_p_sae_ass<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+ X86FoldableSchedWrite sched> {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], hasSideEffects = 0 in
+ defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src), OpcodeStr,
+ "{sae}, $src", "$src, {sae}",
+ (null_frag)>, Sched<[sched]>, EVEX_B;
+}
+
+multiclass avx512_eri_ass<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ defm PSZ : avx512_fp28_p_ass<opc, OpcodeStr#"ps", v16f32_info, sched.ZMM>,
+ avx512_fp28_p_sae_ass<opc, OpcodeStr#"ps", v16f32_info, sched.ZMM>,
+ T8, PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp28_p_ass<opc, OpcodeStr#"pd", v8f64_info, sched.ZMM>,
+ avx512_fp28_p_sae_ass<opc, OpcodeStr#"pd", v8f64_info, sched.ZMM>,
+ T8, PD, EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>;
+}
+
+defm VRSQRT28 : avx512_eri_ass<0xCC, "vrsqrt28", SchedWriteFRsqrt>, EVEX;
+defm VRCP28 : avx512_eri_ass<0xCA, "vrcp28", SchedWriteFRcp>, EVEX;
+defm VEXP2 : avx512_eri_ass<0xC8, "vexp2", SchedWriteFAdd>, EVEX;
+
multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNodeSAE, X86SchedWriteWidths sched> {
defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>,
@@ -9367,14 +9434,6 @@ multiclass avx512_vgetexp_fp16<bits<8> opc, string OpcodeStr, SDNode OpNode,
EVEX_V256, T_MAP6, PD, EVEX_CD8<16, CD8VF>;
}
}
-let Predicates = [HasERI] in {
- defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, X86rsqrt28SAE,
- SchedWriteFRsqrt>, EVEX;
- defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, X86rcp28SAE,
- SchedWriteFRcp>, EVEX;
- defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, X86exp2SAE,
- SchedWriteFAdd>, EVEX;
-}
defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
SchedWriteFRnd>,
avx512_vgetexp_fp16<0x42, "vgetexp", X86fgetexp, X86fgetexpSAE,
@@ -10308,7 +10367,7 @@ defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter",
// prefetch
multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
RegisterClass KRC, X86MemOperand memop> {
- let Predicates = [HasPFI], mayLoad = 1, mayStore = 1 in
+ let mayLoad = 1, mayStore = 1 in
def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
!strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), []>,
EVEX, EVEX_K, Sched<[WriteLoad]>;
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index f14c7200af96..142e1867e616 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -607,14 +607,8 @@ def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(X86strict_fcmp node:$lhs, node:$rhs),
(X86fcmp node:$lhs, node:$rhs)]>;
-// PREFETCHWT1 is supported we want to use it for everything but T0.
def PrefetchWLevel : PatFrag<(ops), (i32 timm), [{
- return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1();
-}]>;
-
-// Use PREFETCHWT1 for NTA, T2, T1.
-def PrefetchWT1Level : TImmLeaf<i32, [{
- return Imm < 3;
+ return N->getSExtValue() <= 3;
}]>;
def X86lock_add_nocf : PatFrag<(ops node:$lhs, node:$rhs),
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index f86e15b3ed5d..dff33a469b97 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -600,19 +600,8 @@ def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>;
def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>;
def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>;
-def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>;
-def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>;
-def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>;
-def X86rcp28SAE : SDNode<"X86ISD::RCP28_SAE", SDTFPUnaryOp>;
-def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOp>;
-def X86exp2SAE : SDNode<"X86ISD::EXP2_SAE", SDTFPUnaryOp>;
-
def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
-def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOp>;
-def X86rsqrt28SAEs : SDNode<"X86ISD::RSQRT28S_SAE", SDTFPBinOp>;
-def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOp>;
-def X86rcp28SAEs : SDNode<"X86ISD::RCP28S_SAE", SDTFPBinOp>;
def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>;
def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>;
def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>;
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 9f2709d6b1a2..419ff9e6f5c0 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -79,8 +79,6 @@ def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">;
def HasCDI : Predicate<"Subtarget->hasCDI()">;
def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">;
-def HasPFI : Predicate<"Subtarget->hasPFI()">;
-def HasERI : Predicate<"Subtarget->hasERI()">;
def HasDQI : Predicate<"Subtarget->hasDQI()">;
def NoDQI : Predicate<"!Subtarget->hasDQI()">;
def HasBWI : Predicate<"Subtarget->hasBWI()">;
@@ -147,7 +145,6 @@ def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasPREFETCHI : Predicate<"Subtarget->hasPREFETCHI()">;
def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">;
-def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasLAHFSAHF64 : Predicate<"Subtarget->hasLAHFSAHF64()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 3bb2f07b5f1a..e3961e0094d3 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -108,15 +108,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, 0, 0),
- X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
- X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
- X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
- X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm),
- X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH,
- X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
- X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
- X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
-
X86_INTRINSIC_DATA(avx512_mask_gather_dpd_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpi_512, GATHER, 0, 0),
X86_INTRINSIC_DATA(avx512_mask_gather_dpq_512, GATHER, 0, 0),
@@ -292,14 +283,6 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, 0, 0),
- X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
- X86::VSCATTERPF1DPDm),
- X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
- X86::VSCATTERPF1DPSm),
- X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm,
- X86::VSCATTERPF1QPDm),
- X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
- X86::VSCATTERPF1QPSm),
X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, 0, 0),
X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, 0, 0),
@@ -454,8 +437,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_dbpsadbw_512, INTR_TYPE_3OP_IMM8, X86ISD::DBPSADBW, 0),
X86_INTRINSIC_DATA(avx512_div_pd_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
X86_INTRINSIC_DATA(avx512_div_ps_512, INTR_TYPE_2OP, ISD::FDIV, X86ISD::FDIV_RND),
- X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
- X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::EXP2, X86ISD::EXP2_SAE),
X86_INTRINSIC_DATA(avx512_fpclass_pd_128, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_fpclass_pd_256, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
X86_INTRINSIC_DATA(avx512_fpclass_pd_512, INTR_TYPE_2OP, X86ISD::VFPCLASS, 0),
@@ -908,10 +889,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
- X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
- X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_SAE, X86ISD::RCP28, X86ISD::RCP28_SAE),
- X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
- X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_SAE, X86ISD::RCP28S, X86ISD::RCP28S_SAE),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
@@ -920,10 +897,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_SAE,X86ISD::RSQRT28, X86ISD::RSQRT28_SAE),
- X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
- X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_SAE,X86ISD::RSQRT28S, X86ISD::RSQRT28S_SAE),
X86_INTRINSIC_DATA(avx512_sitofp_round, INTR_TYPE_1OP, ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
X86_INTRINSIC_DATA(avx512_sqrt_pd_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
X86_INTRINSIC_DATA(avx512_sqrt_ps_512, INTR_TYPE_1OP, ISD::FSQRT, X86ISD::FSQRT_RND),
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 4d55a084b730..4532db134fcb 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -213,17 +213,15 @@ public:
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasPrefetchW() const {
// The PREFETCHW instruction was added with 3DNow but later CPUs gave it
- // its own CPUID bit as part of deprecating 3DNow. Intel eventually added
- // it and KNL has another that prefetches to L2 cache. We assume the
+ // its own CPUID bit as part of deprecating 3DNow. We assume the
// L1 version exists if the L2 version does.
- return hasThreeDNow() || hasPRFCHW() || hasPREFETCHWT1();
+ return hasThreeDNow() || hasPRFCHW();
}
bool hasSSEPrefetch() const {
// We implicitly enable these when we have a write prefix supporting cache
// level OR if we have prfchw, but don't already have a read prefetch from
// 3dnow.
- return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHWT1() ||
- hasPREFETCHI();
+ return hasSSE1() || (hasPRFCHW() && !hasThreeDNow()) || hasPREFETCHI();
}
bool canUseLAHFSAHF() const { return hasLAHFSAHF64() || !is64Bit(); }
// These are generic getters that OR together all of the thunk types
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index c5156c6cb802..68155acd9e5b 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1005,8 +1005,6 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
CPU = "cascadelake";
} else if (testFeature(X86::FEATURE_AVX512VL)) {
CPU = "skylake-avx512";
- } else if (testFeature(X86::FEATURE_AVX512ER)) {
- CPU = "knl";
} else if (testFeature(X86::FEATURE_CLFLUSHOPT)) {
if (testFeature(X86::FEATURE_SHA))
CPU = "goldmont";
@@ -1300,10 +1298,6 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
setFeature(X86::FEATURE_AVX512IFMA);
if (HasLeaf7 && ((EBX >> 23) & 1))
setFeature(X86::FEATURE_CLFLUSHOPT);
- if (HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save)
- setFeature(X86::FEATURE_AVX512PF);
- if (HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save)
- setFeature(X86::FEATURE_AVX512ER);
if (HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save)
setFeature(X86::FEATURE_AVX512CD);
if (HasLeaf7 && ((EBX >> 29) & 1))
@@ -1810,14 +1804,11 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["avx512ifma"] = HasLeaf7 && ((EBX >> 21) & 1) && HasAVX512Save;
Features["clflushopt"] = HasLeaf7 && ((EBX >> 23) & 1);
Features["clwb"] = HasLeaf7 && ((EBX >> 24) & 1);
- Features["avx512pf"] = HasLeaf7 && ((EBX >> 26) & 1) && HasAVX512Save;
- Features["avx512er"] = HasLeaf7 && ((EBX >> 27) & 1) && HasAVX512Save;
Features["avx512cd"] = HasLeaf7 && ((EBX >> 28) & 1) && HasAVX512Save;
Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1);
Features["avx512bw"] = HasLeaf7 && ((EBX >> 30) & 1) && HasAVX512Save;
Features["avx512vl"] = HasLeaf7 && ((EBX >> 31) & 1) && HasAVX512Save;
- Features["prefetchwt1"] = HasLeaf7 && ((ECX >> 0) & 1);
Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save;
Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1);
Features["waitpkg"] = HasLeaf7 && ((ECX >> 5) & 1);
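
With the avx512pf/avx512er/prefetchwt1 entries removed, the remaining detection logic in Host.cpp is the usual leaf-7 bit test gated, for AVX-512 features, on the OS saving ZMM state. A small self-contained sketch of that test, with a made-up EBX value rather than real CPUID output:

#include <cstdio>

static bool hasFeature(bool HasLeaf7, unsigned EBX, unsigned Bit,
                       bool NeedsAVX512Save, bool HasAVX512Save) {
  return HasLeaf7 && ((EBX >> Bit) & 1) && (!NeedsAVX512Save || HasAVX512Save);
}

int main() {
  unsigned EBX = 1u << 28;              // pretend only the AVX512CD bit is set
  bool HasLeaf7 = true, HasAVX512Save = true;
  std::printf("avx512cd: %d\n", hasFeature(HasLeaf7, EBX, 28, true, HasAVX512Save));
  std::printf("sha:      %d\n", hasFeature(HasLeaf7, EBX, 29, false, HasAVX512Save));
  return 0;
}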
diff --git a/llvm/lib/TargetParser/RISCVISAInfo.cpp b/llvm/lib/TargetParser/RISCVISAInfo.cpp
index 827bc5b44387..01d0c71c25a9 100644
--- a/llvm/lib/TargetParser/RISCVISAInfo.cpp
+++ b/llvm/lib/TargetParser/RISCVISAInfo.cpp
@@ -880,7 +880,7 @@ void RISCVISAInfo::updateImplication() {
// implied
if (!HasE && !HasI) {
auto Version = findDefaultVersion("i");
- addExtension("i", Version.value());
+ addExtension("i", *Version);
}
if (HasE && HasI)
@@ -906,7 +906,7 @@ void RISCVISAInfo::updateImplication() {
if (Exts.count(ImpliedExt))
return;
auto Version = findDefaultVersion(ImpliedExt);
- addExtension(ImpliedExt, Version.value());
+ addExtension(ImpliedExt, *Version);
WorkList.insert(ImpliedExt);
});
}
@@ -915,7 +915,7 @@ void RISCVISAInfo::updateImplication() {
if (XLen == 32 && Exts.count("zce") && Exts.count("f") &&
!Exts.count("zcf")) {
auto Version = findDefaultVersion("zcf");
- addExtension("zcf", Version.value());
+ addExtension("zcf", *Version);
}
}
@@ -942,7 +942,7 @@ void RISCVISAInfo::updateCombination() {
});
if (HasAllRequiredFeatures) {
auto Version = findDefaultVersion(CombineExt);
- addExtension(CombineExt, Version.value());
+ addExtension(CombineExt, *Version);
MadeChange = true;
}
}
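
The RISCVISAInfo.cpp hunks swap Version.value() for *Version. Both read the same contained object; value() additionally checks for an empty optional and can throw std::bad_optional_access, while operator* assumes the caller already knows the optional is engaged, which presumably holds here since findDefaultVersion is expected to succeed for known extensions. A tiny standalone illustration, with a stand-in lookup:

#include <cstdio>
#include <optional>
#include <string>

static std::optional<unsigned> findDefaultVersion(const std::string &Ext) {
  // Stand-in for the real lookup: every non-empty extension defaults to 2.
  return Ext.empty() ? std::nullopt : std::optional<unsigned>(2);
}

int main() {
  auto Version = findDefaultVersion("zcf");
  // Equivalent reads when the optional is known to be engaged:
  std::printf("value(): %u, operator*: %u\n", Version.value(), *Version);
  return 0;
}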
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index efe392b94545..e3802380d2be 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -95,9 +95,9 @@ constexpr FeatureBitset FeaturesBroadwell =
// Intel Knights Landing and Knights Mill
// Knights Landing has feature parity with Broadwell.
-constexpr FeatureBitset FeaturesKNL =
- FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureEVEX512 |
- FeatureAVX512CD | FeatureAVX512ER | FeatureAVX512PF | FeaturePREFETCHWT1;
+constexpr FeatureBitset FeaturesKNL = FeaturesBroadwell | FeatureAES |
+ FeatureAVX512F | FeatureEVEX512 |
+ FeatureAVX512CD;
constexpr FeatureBitset FeaturesKNM = FeaturesKNL | FeatureAVX512VPOPCNTDQ;
// Intel Skylake processors.
@@ -500,7 +500,6 @@ constexpr FeatureBitset ImpliedFeaturesMOVDIRI = {};
constexpr FeatureBitset ImpliedFeaturesPCONFIG = {};
constexpr FeatureBitset ImpliedFeaturesPOPCNT = {};
constexpr FeatureBitset ImpliedFeaturesPKU = {};
-constexpr FeatureBitset ImpliedFeaturesPREFETCHWT1 = {};
constexpr FeatureBitset ImpliedFeaturesPRFCHW = {};
constexpr FeatureBitset ImpliedFeaturesPTWRITE = {};
constexpr FeatureBitset ImpliedFeaturesRDPID = {};
@@ -569,8 +568,6 @@ constexpr FeatureBitset ImpliedFeaturesSM4 = FeatureAVX2;
constexpr FeatureBitset ImpliedFeaturesAVX512CD = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512BW = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512DQ = FeatureAVX512F;
-constexpr FeatureBitset ImpliedFeaturesAVX512ER = FeatureAVX512F;
-constexpr FeatureBitset ImpliedFeaturesAVX512PF = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512VL = FeatureAVX512F;
constexpr FeatureBitset ImpliedFeaturesAVX512BF16 = FeatureAVX512BW;
@@ -751,13 +748,13 @@ unsigned llvm::X86::getFeaturePriority(ProcessorFeatures Feat) {
#ifndef NDEBUG
// Check that priorities are set properly in the .def file. We expect that
// "compat" features are assigned non-duplicate consecutive priorities
- // starting from one (1, ..., 37) and multiple zeros.
+ // starting from one (1, ..., 35) and multiple zeros.
#define X86_FEATURE_COMPAT(ENUM, STR, PRIORITY) PRIORITY,
unsigned Priorities[] = {
#include "llvm/TargetParser/X86TargetParser.def"
};
std::array<unsigned, std::size(Priorities)> HelperList;
- const size_t MaxPriority = 37;
+ const size_t MaxPriority = 35;
std::iota(HelperList.begin(), HelperList.begin() + MaxPriority + 1, 0);
for (size_t i = MaxPriority + 1; i != std::size(Priorities); ++i)
HelperList[i] = 0;
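
The MaxPriority drop from 37 to 35 reflects two compat features leaving the .def list; the NDEBUG check still verifies that the non-zero priorities are exactly the consecutive values 1..MaxPriority with every other entry zero. A toy standalone version of that consistency check (the Priorities array below is illustrative, not the .def contents):

#include <algorithm>
#include <array>
#include <cstdio>
#include <numeric>

int main() {
  constexpr size_t MaxPriority = 3;
  std::array<unsigned, 6> Priorities = {1, 3, 0, 2, 0, 0}; // toy data
  std::array<unsigned, 6> HelperList{};
  std::iota(HelperList.begin(), HelperList.begin() + MaxPriority + 1, 0u);
  // Sorting both and comparing checks "1..MaxPriority once each, rest zero".
  auto Sorted = Priorities;
  std::sort(Sorted.begin(), Sorted.end());
  std::sort(HelperList.begin(), HelperList.end());
  std::printf("priorities consistent: %s\n",
              std::equal(Sorted.begin(), Sorted.end(), HelperList.begin()) ? "yes" : "no");
  return 0;
}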
diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
index bb244489e4c2..74b5ccb7b9b7 100644
--- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -464,13 +464,9 @@ bool CoroIdElider::attemptElide() {
return true;
}
-static bool declaresCoroElideIntrinsics(Module &M) {
- return coro::declaresIntrinsics(M, {"llvm.coro.id", "llvm.coro.id.async"});
-}
-
PreservedAnalyses CoroElidePass::run(Function &F, FunctionAnalysisManager &AM) {
auto &M = *F.getParent();
- if (!declaresCoroElideIntrinsics(M))
+ if (!coro::declaresIntrinsics(M, {"llvm.coro.id"}))
return PreservedAnalyses::all();
FunctionElideInfo FEI{&F};
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 08a4522e3fac..38b8dab984db 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/StackLifetime.h"
#include "llvm/Config/llvm-config.h"
@@ -1440,17 +1441,22 @@ namespace {
struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
using Base = PtrUseVisitor<AllocaUseVisitor>;
AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
- const CoroBeginInst &CB, const SuspendCrossingInfo &Checker,
+ const coro::Shape &CoroShape,
+ const SuspendCrossingInfo &Checker,
bool ShouldUseLifetimeStartInfo)
- : PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker),
- ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {}
+ : PtrUseVisitor(DL), DT(DT), CoroShape(CoroShape), Checker(Checker),
+ ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {
+ for (AnyCoroSuspendInst *SuspendInst : CoroShape.CoroSuspends)
+ CoroSuspendBBs.insert(SuspendInst->getParent());
+ }
void visit(Instruction &I) {
Users.insert(&I);
Base::visit(I);
// If the pointer is escaped prior to CoroBegin, we have to assume it would
// be written into before CoroBegin as well.
- if (PI.isEscaped() && !DT.dominates(&CoroBegin, PI.getEscapingInst())) {
+ if (PI.isEscaped() &&
+ !DT.dominates(CoroShape.CoroBegin, PI.getEscapingInst())) {
MayWriteBeforeCoroBegin = true;
}
}
@@ -1553,10 +1559,19 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
// When we found the lifetime markers refers to a
// subrange of the original alloca, ignore the lifetime
// markers to avoid misleading the analysis.
- if (II.getIntrinsicID() != Intrinsic::lifetime_start || !IsOffsetKnown ||
- !Offset.isZero())
+ if (!IsOffsetKnown || !Offset.isZero())
+ return Base::visitIntrinsicInst(II);
+ switch (II.getIntrinsicID()) {
+ default:
return Base::visitIntrinsicInst(II);
- LifetimeStarts.insert(&II);
+ case Intrinsic::lifetime_start:
+ LifetimeStarts.insert(&II);
+ LifetimeStartBBs.push_back(II.getParent());
+ break;
+ case Intrinsic::lifetime_end:
+ LifetimeEndBBs.insert(II.getParent());
+ break;
+ }
}
void visitCallBase(CallBase &CB) {
@@ -1586,7 +1601,7 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
private:
const DominatorTree &DT;
- const CoroBeginInst &CoroBegin;
+ const coro::Shape &CoroShape;
const SuspendCrossingInfo &Checker;
// All alias to the original AllocaInst, created before CoroBegin and used
// after CoroBegin. Each entry contains the instruction and the offset in the
@@ -1594,6 +1609,9 @@ private:
DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{};
SmallPtrSet<Instruction *, 4> Users{};
SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{};
+ SmallVector<BasicBlock *> LifetimeStartBBs{};
+ SmallPtrSet<BasicBlock *, 2> LifetimeEndBBs{};
+ SmallPtrSet<const BasicBlock *, 2> CoroSuspendBBs{};
bool MayWriteBeforeCoroBegin{false};
bool ShouldUseLifetimeStartInfo{true};
@@ -1605,10 +1623,19 @@ private:
// every basic block that uses the pointer to see if they cross suspension
// points. The uses cover both direct uses as well as indirect uses.
if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) {
- for (auto *I : Users)
- for (auto *S : LifetimeStarts)
- if (Checker.isDefinitionAcrossSuspend(*S, I))
- return true;
+ // If there is no explicit lifetime.end, then assume the address can
+ // cross suspension points.
+ if (LifetimeEndBBs.empty())
+ return true;
+
+ // If there is a path from a lifetime.start to a suspend without a
+ // corresponding lifetime.end, then the alloca's lifetime persists
+ // beyond that suspension point and the alloca must go on the frame.
+ llvm::SmallVector<BasicBlock *> Worklist(LifetimeStartBBs);
+ if (isManyPotentiallyReachableFromMany(Worklist, CoroSuspendBBs,
+ &LifetimeEndBBs, &DT))
+ return true;
+
// Addresses are guaranteed to be identical after every lifetime.start so
// we cannot use the local stack if the address escaped and there is a
// suspend point between lifetime markers. This should also cover the
@@ -1646,13 +1673,13 @@ private:
}
void handleMayWrite(const Instruction &I) {
- if (!DT.dominates(&CoroBegin, &I))
+ if (!DT.dominates(CoroShape.CoroBegin, &I))
MayWriteBeforeCoroBegin = true;
}
bool usedAfterCoroBegin(Instruction &I) {
for (auto &U : I.uses())
- if (DT.dominates(&CoroBegin, U))
+ if (DT.dominates(CoroShape.CoroBegin, U))
return true;
return false;
}
@@ -1661,7 +1688,7 @@ private:
// We track all aliases created prior to CoroBegin but used after.
// These aliases may need to be recreated after CoroBegin if the alloca
// need to live on the frame.
- if (DT.dominates(&CoroBegin, &I) || !usedAfterCoroBegin(I))
+ if (DT.dominates(CoroShape.CoroBegin, &I) || !usedAfterCoroBegin(I))
return;
if (!IsOffsetKnown) {
@@ -2830,8 +2857,7 @@ static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
bool ShouldUseLifetimeStartInfo =
(Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
Shape.ABI != coro::ABI::RetconOnce);
- AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT,
- *Shape.CoroBegin, Checker,
+ AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT, Shape, Checker,
ShouldUseLifetimeStartInfo};
Visitor.visitPtr(*AI);
if (!Visitor.getShouldLiveOnFrame())
@@ -2948,10 +2974,12 @@ void coro::salvageDebugInfo(
std::optional<BasicBlock::iterator> InsertPt;
if (auto *I = dyn_cast<Instruction>(Storage)) {
InsertPt = I->getInsertionPointAfterDef();
- // Update DILocation only in O0 since it is easy to get out of sync in
- // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
- // an example.
- if (!OptimizeFrame && I->getDebugLoc())
+    // Update the DILocation only if the variable was not inlined.
+ DebugLoc ILoc = I->getDebugLoc();
+ DebugLoc DVILoc = DVI.getDebugLoc();
+ if (ILoc && DVILoc &&
+ DVILoc->getScope()->getSubprogram() ==
+ ILoc->getScope()->getSubprogram())
DVI.setDebugLoc(I->getDebugLoc());
} else if (isa<Argument>(Storage))
InsertPt = F->getEntryBlock().begin();
@@ -2988,11 +3016,13 @@ void coro::salvageDebugInfo(
std::optional<BasicBlock::iterator> InsertPt;
if (auto *I = dyn_cast<Instruction>(Storage)) {
InsertPt = I->getInsertionPointAfterDef();
- // Update DILocation only in O0 since it is easy to get out of sync in
- // optimizations. See https://github.com/llvm/llvm-project/pull/75104 for
- // an example.
- if (!OptimizeFrame && I->getDebugLoc())
- DVR.setDebugLoc(I->getDebugLoc());
+    // Update the DILocation only if the variable was not inlined.
+ DebugLoc ILoc = I->getDebugLoc();
+ DebugLoc DVRLoc = DVR.getDebugLoc();
+ if (ILoc && DVRLoc &&
+ DVRLoc->getScope()->getSubprogram() ==
+ ILoc->getScope()->getSubprogram())
+ DVR.setDebugLoc(ILoc);
} else if (isa<Argument>(Storage))
InsertPt = F->getEntryBlock().begin();
if (InsertPt) {
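Both salvageDebugInfo hunks replace the old "only at -O0" guard with a scope check: the storage instruction's location is copied onto the debug variable intrinsic/record only when both locations exist and belong to the same subprogram, i.e. the variable was not inlined in from elsewhere. A minimal sketch of that guard as a helper, assuming only the LLVM debug-info headers already used by this file:

#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"

// True when both locations are present and were emitted for the same
// subprogram, so copying one onto the other cannot mix inlined scopes.
static bool sameSubprogram(const llvm::DebugLoc &A, const llvm::DebugLoc &B) {
  return A && B &&
         A->getScope()->getSubprogram() == B->getScope()->getSubprogram();
}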
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 1d9cf185b75a..5a58a99d2879 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -227,6 +227,7 @@ static void lowerAwaitSuspend(IRBuilder<> &Builder, CoroAwaitSuspendInst *CB,
FunctionType *ResumeTy = FunctionType::get(
Type::getVoidTy(Ctx), PointerType::getUnqual(Ctx), false);
auto *ResumeCall = Builder.CreateCall(ResumeTy, ResumeAddr, {NewCall});
+ ResumeCall->setCallingConv(CallingConv::Fast);
// We can't insert the 'ret' instruction and adjust the cc until the
// function has been split, so remember this for later.
@@ -1088,7 +1089,6 @@ void CoroCloner::create() {
// Turn symmetric transfers into musttail calls.
for (CallInst *ResumeCall : Shape.SymmetricTransfers) {
ResumeCall = cast<CallInst>(VMap[ResumeCall]);
- ResumeCall->setCallingConv(NewF->getCallingConv());
if (TTI.supportsTailCallFor(ResumeCall)) {
// FIXME: Could we support symmetric transfer effectively without
// musttail?
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index e3920b9e1d2b..b6866580ccd3 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -3954,7 +3954,7 @@ static bool runAttributorLightOnFunctions(InformationCache &InfoCache,
// We look at internal functions only on-demand but if any use is not a
// direct call or outside the current set of analyzed functions, we have
// to do it eagerly.
- if (F->hasLocalLinkage()) {
+ if (AC.UseLiveness && F->hasLocalLinkage()) {
if (llvm::all_of(F->uses(), [&Functions](const Use &U) {
const auto *CB = dyn_cast<CallBase>(U.getUser());
return CB && CB->isCallee(&U) &&
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 41b66aafe7d3..1b3bf3c732ed 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -5690,6 +5690,9 @@ bool AANoCapture::isImpliedByIR(Attributor &A, const IRPosition &IRP,
return V.use_empty();
// You cannot "capture" null in the default address space.
+ //
+ // FIXME: This should use NullPointerIsDefined to account for the function
+ // attribute.
if (isa<UndefValue>(V) || (isa<ConstantPointerNull>(V) &&
V.getType()->getPointerAddressSpace() == 0)) {
return true;
@@ -5899,10 +5902,13 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
const Function *F =
isArgumentPosition() ? IRP.getAssociatedFunction() : IRP.getAnchorScope();
- assert(F && "Expected a function!");
- const IRPosition &FnPos = IRPosition::function(*F);
+
+ // TODO: Is the checkForAllUses below useful for constants?
+ if (!F)
+ return indicatePessimisticFixpoint();
AANoCapture::StateType T;
+ const IRPosition &FnPos = IRPosition::function(*F);
// Readonly means we cannot capture through memory.
bool IsKnown;
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index a116fd653534..cb19bf2a4ae1 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1435,7 +1435,8 @@ void llvm::gatherImportedSummariesForModule(
StringRef ModulePath,
const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
const FunctionImporter::ImportMapTy &ImportList,
- std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex,
+ GVSummaryPtrSet &DecSummaries) {
// Include all summaries from the importing module.
ModuleToSummariesForIndex[std::string(ModulePath)] =
ModuleToDefinedGVSummaries.lookup(ModulePath);
@@ -1450,7 +1451,7 @@ void llvm::gatherImportedSummariesForModule(
assert(DS != DefinedGVSummaries.end() &&
"Expected a defined summary for imported global value");
if (Type == GlobalValueSummary::Declaration)
- continue;
+ DecSummaries.insert(DS->second);
SummariesForIndex[GUID] = DS->second;
}
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b9d84d583f49..c53b9451625c 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1889,15 +1889,17 @@ bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
} else if (findProfiledCalleeThroughTailCalls(
ProfiledCallee, CalledFunction, Depth + 1,
FoundCalleeChain, FoundMultipleCalleeChains)) {
- if (FoundMultipleCalleeChains)
- return false;
+ // findProfiledCalleeThroughTailCalls should not have returned
+ // true if FoundMultipleCalleeChains.
+ assert(!FoundMultipleCalleeChains);
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
}
FoundSingleCalleeChain = true;
SaveCallsiteInfo(&I, CalleeFunc);
- }
+ } else if (FoundMultipleCalleeChains)
+ return false;
}
}
@@ -2004,8 +2006,9 @@ bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
} else if (findProfiledCalleeThroughTailCalls(
ProfiledCallee, CallEdge.first, Depth + 1,
FoundCalleeChain, FoundMultipleCalleeChains)) {
- if (FoundMultipleCalleeChains)
- return false;
+ // findProfiledCalleeThroughTailCalls should not have returned
+ // true if FoundMultipleCalleeChains.
+ assert(!FoundMultipleCalleeChains);
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
@@ -2015,7 +2018,8 @@ bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
// Add FS to FSToVIMap in case it isn't already there.
assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
FSToVIMap[FS] = FSVI;
- }
+ } else if (FoundMultipleCalleeChains)
+ return false;
}
}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index eea9399127e8..e3a4821b8226 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -4238,7 +4238,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
ORA << "Value has potential side effects preventing SPMD-mode "
"execution";
if (isa<CallBase>(NonCompatibleI)) {
- ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
+ ORA << ". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
"the called function to override";
}
return ORA << ".";
@@ -4380,7 +4380,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
continue;
auto Remark = [&](OptimizationRemarkAnalysis ORA) {
return ORA << "Call may contain unknown parallel regions. Use "
- << "`__attribute__((assume(\"omp_no_parallelism\")))` to "
+ << "`[[omp::assume(\"omp_no_parallelism\")]]` to "
"override.";
};
A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
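The two remark strings now suggest the portable C++ attribute spelling instead of the GNU __attribute__ form. As a purely hypothetical illustration (not part of the patch), code reacting to the remark would annotate the callee roughly as follows, assuming a Clang build with OpenMP offloading enabled:

// Assert to the optimizer that this callee is amenable to SPMD-mode execution.
[[omp::assume("ompx_spmd_amenable")]]
void helper_called_from_target_region(int *Data, int N);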
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 542a1c82b127..430f3e12fa5b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -214,6 +214,9 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
// Find out if the comparison would be true or false for the i'th element.
Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
CompareRHS, DL, &TLI);
+ if (!C)
+ return nullptr;
+
// If the result is undef for this element, ignore it.
if (isa<UndefValue>(C)) {
// Extend range state machines to cover this element in case there is an
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 4351a55ea1d3..832f89ed0b64 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -332,7 +332,7 @@ bool PointerReplacer::collectUsersRecursive(Instruction &I) {
Worklist.insert(SI);
if (!collectUsersRecursive(*SI))
return false;
- } else if (isa<GetElementPtrInst, BitCastInst>(Inst)) {
+ } else if (isa<GetElementPtrInst>(Inst)) {
Worklist.insert(Inst);
if (!collectUsersRecursive(*Inst))
return false;
@@ -393,15 +393,6 @@ void PointerReplacer::replace(Instruction *I) {
NewI->takeName(GEP);
NewI->setIsInBounds(GEP->isInBounds());
WorkMap[GEP] = NewI;
- } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
- auto *V = getReplacement(BC->getOperand(0));
- assert(V && "Operand not replaced");
- auto *NewT = PointerType::get(BC->getType()->getContext(),
- V->getType()->getPointerAddressSpace());
- auto *NewI = new BitCastInst(V, NewT);
- IC.InsertNewInstWith(NewI, BC->getIterator());
- NewI->takeName(BC);
- WorkMap[BC] = NewI;
} else if (auto *SI = dyn_cast<SelectInst>(I)) {
auto *NewSI = SelectInst::Create(
SI->getCondition(), getReplacement(SI->getTrueValue()),
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 6c25ff215c37..eb48157af009 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5000,31 +5000,24 @@ bool InstCombinerImpl::run() {
BasicBlock *UserParent = nullptr;
unsigned NumUsers = 0;
- for (auto *U : I->users()) {
- if (U->isDroppable())
+ for (Use &U : I->uses()) {
+ User *User = U.getUser();
+ if (User->isDroppable())
continue;
if (NumUsers > MaxSinkNumUsers)
return std::nullopt;
- Instruction *UserInst = cast<Instruction>(U);
+ Instruction *UserInst = cast<Instruction>(User);
// Special handling for Phi nodes - get the block the use occurs in.
- if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
- for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
- if (PN->getIncomingValue(i) == I) {
- // Bail out if we have uses in different blocks. We don't do any
- // sophisticated analysis (i.e finding NearestCommonDominator of
- // these use blocks).
- if (UserParent && UserParent != PN->getIncomingBlock(i))
- return std::nullopt;
- UserParent = PN->getIncomingBlock(i);
- }
- }
- assert(UserParent && "expected to find user block!");
- } else {
- if (UserParent && UserParent != UserInst->getParent())
- return std::nullopt;
- UserParent = UserInst->getParent();
- }
+ BasicBlock *UserBB = UserInst->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+ UserBB = PN->getIncomingBlock(U);
+      // Bail out if we have uses in different blocks. We don't do any
+      // sophisticated analysis (i.e., finding the NearestCommonDominator of
+      // these use blocks).
+ if (UserParent && UserParent != UserBB)
+ return std::nullopt;
+ UserParent = UserBB;
// Make sure these checks are done only once, naturally we do the checks
// the first time we get the userparent, this will save compile time.
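The rewritten loop walks I's uses rather than its users so that, for a PHI user, the incoming block of that specific use can be taken directly with getIncomingBlock(U) instead of rescanning all incoming values. A minimal sketch of the pattern, assuming the standard llvm::PHINode API and, like the original code, that every non-droppable user is an Instruction:

#include "llvm/IR/Instructions.h"

// The block in which a particular use of a value "occurs". For PHI users the
// relevant block is the incoming block of that use, not the PHI's own block.
static llvm::BasicBlock *blockOfUse(const llvm::Use &U) {
  auto *UserInst = llvm::cast<llvm::Instruction>(U.getUser());
  if (auto *PN = llvm::dyn_cast<llvm::PHINode>(UserInst))
    return PN->getIncomingBlock(U);
  return UserInst->getParent();
}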
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 8d39217992c7..2aa21759d56e 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1589,6 +1589,14 @@ void HWAddressSanitizer::sanitizeFunction(Function &F,
assert(!ShadowBase);
+ // Remove memory attributes that are about to become invalid.
+ // HWASan checks read from shadow, which invalidates memory(argmem: *)
+ // Short granule checks on function arguments read from the argument memory
+ // (last byte of the granule), which invalidates writeonly.
+ F.removeFnAttr(llvm::Attribute::Memory);
+ for (auto &A : F.args())
+ A.removeAttr(llvm::Attribute::WriteOnly);
+
BasicBlock::iterator InsertPt = F.getEntryBlock().begin();
IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt);
emitPrologue(EntryIRB,
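The new prologue code drops attributes whose guarantees stop holding once checks are inserted: the instrumented function now also reads shadow memory, and short-granule checks read back the last byte of an argument's allocation. A minimal sketch of the same cleanup factored into a helper, using only the attribute APIs visible in the hunk:

#include "llvm/IR/Function.h"

// Strip memory attributes that HWASan instrumentation would make incorrect.
static void dropHWASanIncompatibleAttrs(llvm::Function &F) {
  // memory(argmem: ...) and friends no longer hold once shadow is read.
  F.removeFnAttr(llvm::Attribute::Memory);
  // writeonly arguments may be read by short-granule checks.
  for (llvm::Argument &A : F.args())
    A.removeAttr(llvm::Attribute::WriteOnly);
}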
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 7e48c28176bd..70bfa469193b 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -554,6 +554,12 @@ static Decomposition decompose(Value *V,
V = Op0;
}
+ if (match(V, m_SExt(m_Value(Op0)))) {
+ V = Op0;
+ Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0,
+ ConstantInt::get(Op0->getType(), 0));
+ }
+
Value *Op1;
ConstantInt *CI;
if (match(V, m_NUWAdd(m_Value(Op0), m_Value(Op1)))) {
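The new case lets the decomposition look through sext by recording Op0 >= 0 as a precondition: for a non-negative operand, sign extension and zero extension produce the same value, so the extended value can be reasoned about via the operand itself. A small standalone check of the equivalence the precondition relies on:

#include <cassert>
#include <cstdint>

int main() {
  // For a non-negative i8 value, sext and zext to i32 agree; this is what
  // the ICMP_SGE >= 0 precondition recorded above guarantees.
  int8_t X = 42;
  assert(X >= 0);
  int32_t SExt = static_cast<int32_t>(X);                       // sign extend
  int32_t ZExt = static_cast<int32_t>(static_cast<uint8_t>(X)); // zero extend
  assert(SExt == ZExt);
  return 0;
}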
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index ce40e8b31b76..4f36bac11e34 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -43,7 +43,7 @@ static ValueLatticeElement::MergeOptions getMaxWidenStepsOpts() {
}
static ConstantRange getConstantRange(const ValueLatticeElement &LV, Type *Ty,
- bool UndefAllowed = true) {
+ bool UndefAllowed) {
assert(Ty->isIntOrIntVectorTy() && "Should be int or int vector");
if (LV.isConstantRange(UndefAllowed))
return LV.getConstantRange();
@@ -1297,7 +1297,8 @@ void SCCPInstVisitor::visitCastInst(CastInst &I) {
if (I.getDestTy()->isIntegerTy() && I.getSrcTy()->isIntOrIntVectorTy()) {
auto &LV = getValueState(&I);
- ConstantRange OpRange = getConstantRange(OpSt, I.getSrcTy());
+ ConstantRange OpRange =
+ getConstantRange(OpSt, I.getSrcTy(), /*UndefAllowed=*/false);
Type *DestTy = I.getDestTy();
// Vectors where all elements have the same known constant range are treated
@@ -1329,8 +1330,8 @@ void SCCPInstVisitor::handleExtractOfWithOverflow(ExtractValueInst &EVI,
return; // Wait to resolve.
Type *Ty = LHS->getType();
- ConstantRange LR = getConstantRange(L, Ty);
- ConstantRange RR = getConstantRange(R, Ty);
+ ConstantRange LR = getConstantRange(L, Ty, /*UndefAllowed=*/false);
+ ConstantRange RR = getConstantRange(R, Ty, /*UndefAllowed=*/false);
if (Idx == 0) {
ConstantRange Res = LR.binaryOp(WO->getBinaryOp(), RR);
mergeInValue(&EVI, ValueLatticeElement::getRange(Res));
@@ -1534,8 +1535,10 @@ void SCCPInstVisitor::visitBinaryOperator(Instruction &I) {
return markOverdefined(&I);
// Try to simplify to a constant range.
- ConstantRange A = getConstantRange(V1State, I.getType());
- ConstantRange B = getConstantRange(V2State, I.getType());
+ ConstantRange A =
+ getConstantRange(V1State, I.getType(), /*UndefAllowed=*/false);
+ ConstantRange B =
+ getConstantRange(V2State, I.getType(), /*UndefAllowed=*/false);
auto *BO = cast<BinaryOperator>(&I);
ConstantRange R = ConstantRange::getEmpty(I.getType()->getScalarSizeInBits());
@@ -1818,7 +1821,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
// Combine range info for the original value with the new range from the
// condition.
- auto CopyOfCR = getConstantRange(CopyOfVal, CopyOf->getType());
+ auto CopyOfCR = getConstantRange(CopyOfVal, CopyOf->getType(),
+ /*UndefAllowed=*/true);
auto NewCR = ImposedCR.intersectWith(CopyOfCR);
// If the existing information is != x, do not use the information from
// a chained predicate, as the != x information is more likely to be
@@ -1863,7 +1867,8 @@ void SCCPInstVisitor::handleCallResult(CallBase &CB) {
const ValueLatticeElement &State = getValueState(Op);
if (State.isUnknownOrUndef())
return;
- OpRanges.push_back(getConstantRange(State, Op->getType()));
+ OpRanges.push_back(
+ getConstantRange(State, Op->getType(), /*UndefAllowed=*/false));
}
ConstantRange Result =
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 93701b2a7791..fe6ec8819ff9 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -5501,11 +5501,13 @@ static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
}
static void createUnreachableSwitchDefault(SwitchInst *Switch,
- DomTreeUpdater *DTU) {
+ DomTreeUpdater *DTU,
+ bool RemoveOrigDefaultBlock = true) {
LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
auto *BB = Switch->getParent();
auto *OrigDefaultBlock = Switch->getDefaultDest();
- OrigDefaultBlock->removePredecessor(BB);
+ if (RemoveOrigDefaultBlock)
+ OrigDefaultBlock->removePredecessor(BB);
BasicBlock *NewDefaultBlock = BasicBlock::Create(
BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(),
OrigDefaultBlock);
@@ -5514,7 +5516,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch,
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.push_back({DominatorTree::Insert, BB, &*NewDefaultBlock});
- if (!is_contained(successors(BB), OrigDefaultBlock))
+ if (RemoveOrigDefaultBlock &&
+ !is_contained(successors(BB), OrigDefaultBlock))
Updates.push_back({DominatorTree::Delete, BB, &*OrigDefaultBlock});
DTU->applyUpdates(Updates);
}
@@ -5696,10 +5699,33 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
Known.getBitWidth() - (Known.Zero | Known.One).popcount();
assert(NumUnknownBits <= Known.getBitWidth());
if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */ &&
- SI->getNumCases() == (1ULL << NumUnknownBits)) {
- createUnreachableSwitchDefault(SI, DTU);
- return true;
+ NumUnknownBits < 64 /* avoid overflow */) {
+ uint64_t AllNumCases = 1ULL << NumUnknownBits;
+ if (SI->getNumCases() == AllNumCases) {
+ createUnreachableSwitchDefault(SI, DTU);
+ return true;
+ }
+ // When only one case value is missing, replace default with that case.
+ // Eliminating the default branch will provide more opportunities for
+ // optimization, such as lookup tables.
+ if (SI->getNumCases() == AllNumCases - 1) {
+ assert(NumUnknownBits > 1 && "Should be canonicalized to a branch");
+ IntegerType *CondTy = cast<IntegerType>(Cond->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+
+ uint64_t MissingCaseVal = 0;
+ for (const auto &Case : SI->cases())
+ MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue();
+ auto *MissingCase =
+ cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal));
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0));
+ createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false);
+ SIW.setSuccessorWeight(0, 0);
+ return true;
+ }
}
if (DeadCases.empty())
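The added branch handles a switch over NumUnknownBits unknown bits that covers all but one of the 2^NumUnknownBits possible values: the missing value is recovered by XOR-ing the present case values, since for NumUnknownBits > 1 the XOR of the entire value range is zero, so the XOR of the present values equals the absent one. A small standalone check of that identity:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned NumUnknownBits = 3; // 8 possible case values: 0..7
  const uint64_t Missing = 5;        // the single value with no case
  uint64_t MissingCaseVal = 0;
  for (uint64_t V = 0; V < (1ULL << NumUnknownBits); ++V)
    if (V != Missing)
      MissingCaseVal ^= V; // XOR of all present case values
  // XOR over the full range 0..2^n-1 is 0 for n > 1, so the fold recovers
  // the one missing value.
  assert(MissingCaseVal == Missing);
  return 0;
}

This is also why the hunk asserts NumUnknownBits > 1: with a single unknown bit the identity does not hold, and such a switch should already have been turned into a branch.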
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6d64aaa75922..48981a6bd39e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3384,18 +3384,6 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
TargetTransformInfo::TCK_RecipThroughput);
}
-static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
-}
-
-static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
-}
-
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
VPlan &Plan) {
// Fix widened non-induction PHIs by setting up the PHI operands.
@@ -7120,26 +7108,12 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return *RedCost;
Type *SrcScalarTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ SrcScalarTy =
+ IntegerType::get(SrcScalarTy->getContext(), MinBWs[Op0AsInstruction]);
Type *SrcVecTy =
VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
- if (canTruncateToMinimalBitwidth(I, VF)) {
- // This cast is going to be shrunk. This may remove the cast or it might
- // turn it into slightly different cast. For example, if MinBW == 16,
- // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
- //
- // Calculate the modified src and dest types.
- Type *MinVecTy = VectorTy;
- if (Opcode == Instruction::Trunc) {
- SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy =
- largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
- // Leave SrcVecTy unchanged - we only shrink the destination element
- // type.
- VectorTy =
- smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- }
- }
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
@@ -7533,8 +7507,9 @@ LoopVectorizationPlanner::executePlan(
LLVM_DEBUG(BestVPlan.dump());
// Perform the actual loop transformation.
- VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
- OrigLoop->getHeader()->getContext());
+ VPTransformState State(BestVF, BestUF, LI,
+ EnableVPlanNativePath ? nullptr : DT, ILV.Builder,
+ &ILV, &BestVPlan, OrigLoop->getHeader()->getContext());
// 0. Generate SCEV-dependent code into the preheader, including TripCount,
// before making any changes to the CFG.
@@ -8157,8 +8132,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
VPValue *Start, const InductionDescriptor &IndDesc,
- VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop,
- VFRange &Range) {
+ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
assert(IndDesc.getStartValue() ==
Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
@@ -8180,7 +8154,7 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
// produces its scalar and vector values.
if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, Plan,
- *PSE.getSE(), *OrigLoop, Range);
+ *PSE.getSE(), *OrigLoop);
// Check if this is pointer induction. If so, build the recipe for it.
if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
@@ -8220,7 +8194,7 @@ VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
VPValue *Start = Plan.getOrAddLiveIn(II.getStartValue());
return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
- *OrigLoop, Range);
+ *OrigLoop);
}
return nullptr;
}
@@ -8561,8 +8535,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
*Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext());
VPlanTransforms::optimize(*Plan, *PSE.getSE());
// TODO: try to put it close to addActiveLaneMask().
- if (CM.foldTailWithEVL())
- VPlanTransforms::addExplicitVectorLength(*Plan);
+ // Discard the plan if it is not EVL-compatible
+ if (CM.foldTailWithEVL() &&
+ !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
@@ -10402,6 +10378,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
PA.preserve<DominatorTreeAnalysis>();
PA.preserve<ScalarEvolutionAnalysis>();
}
+
PA.preserve<LoopAnalysis>();
if (Result.MadeCFGChange) {
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 140a1b1ffbaf..f044a8cdd2f3 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -255,6 +255,21 @@ static bool isVectorLikeInstWithConstOps(Value *V) {
return isConstant(I->getOperand(2));
}
+/// Returns the power-of-2 number of elements in a single register (part),
+/// given the total number of elements \p Size and the number of registers
+/// (parts) \p NumParts.
+static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
+ return PowerOf2Ceil(divideCeil(Size, NumParts));
+}
+
+/// Returns the number of elements remaining in the current register (part)
+/// \p Part, given the total number of elements \p Size and the (power-of-2)
+/// number of elements in a single register \p PartNumElems.
+static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
+ unsigned Part) {
+ return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
+}
+
#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL) {
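getPartNumElems and getNumElems centralize the per-register arithmetic that the later hunks use in place of the old VL.size() / NumParts split, which misbehaves when the element count is not a multiple of the register count. A standalone worked example of the intended values (a plain reimplementation using ordinary ceil-divide and power-of-2 rounding, not LLVM's divideCeil/PowerOf2Ceil):

#include <algorithm>
#include <cassert>

static unsigned divideCeil(unsigned A, unsigned B) { return (A + B - 1) / B; }
static unsigned powerOf2Ceil(unsigned A) {
  unsigned P = 1;
  while (P < A)
    P <<= 1;
  return P;
}
// Power-of-2 number of elements per register (part).
static unsigned partNumElems(unsigned Size, unsigned NumParts) {
  return powerOf2Ceil(divideCeil(Size, NumParts));
}
// Elements actually remaining in register (part) Part.
static unsigned numElems(unsigned Size, unsigned PartNumElems, unsigned Part) {
  return std::min(PartNumElems, Size - Part * PartNumElems);
}

int main() {
  // 6 scalars split over 2 registers: 4 lanes per register, but the second
  // register only holds the 2 remaining elements.
  assert(partNumElems(6, 2) == 4);
  assert(numElems(6, 4, 0) == 4);
  assert(numElems(6, 4, 1) == 2);
  return 0;
}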
@@ -502,6 +517,15 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
+ bool HasNonUndefVec = any_of(VL, [](Value *V) {
+ auto *EE = dyn_cast<ExtractElementInst>(V);
+ if (!EE)
+ return false;
+ Value *Vec = EE->getVectorOperand();
+ if (isa<UndefValue>(Vec))
+ return false;
+ return isGuaranteedNotToBePoison(Vec);
+ });
enum ShuffleMode { Unknown, Select, Permute };
ShuffleMode CommonShuffleMode = Unknown;
Mask.assign(VL.size(), PoisonMaskElem);
@@ -514,21 +538,27 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
return std::nullopt;
auto *Vec = EI->getVectorOperand();
// We can extractelement from undef or poison vector.
- if (isUndefVector(Vec).all())
+ if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
continue;
// All vector operands must have the same number of vector elements.
- if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
- return std::nullopt;
- if (isa<UndefValue>(EI->getIndexOperand()))
- continue;
- auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
- if (!Idx)
- return std::nullopt;
- // Undefined behavior if Idx is negative or >= Size.
- if (Idx->getValue().uge(Size))
+ if (isa<UndefValue>(Vec)) {
+ Mask[I] = I;
+ } else {
+ if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
+ return std::nullopt;
+ if (isa<UndefValue>(EI->getIndexOperand()))
+ continue;
+ auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
+ if (!Idx)
+ return std::nullopt;
+ // Undefined behavior if Idx is negative or >= Size.
+ if (Idx->getValue().uge(Size))
+ continue;
+ unsigned IntIdx = Idx->getValue().getZExtValue();
+ Mask[I] = IntIdx;
+ }
+ if (isUndefVector(Vec).all() && HasNonUndefVec)
continue;
- unsigned IntIdx = Idx->getValue().getZExtValue();
- Mask[I] = IntIdx;
// For correct shuffling we have to have at most 2 different vector operands
// in all extractelement instructions.
if (!Vec1 || Vec1 == Vec) {
@@ -543,7 +573,7 @@ isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
continue;
// If the extract index is not the same as the operation number, it is a
// permutation.
- if (IntIdx != I) {
+ if (Mask[I] % Size != I) {
CommonShuffleMode = Permute;
continue;
}
@@ -4066,7 +4096,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
const int VF = GetVF(I);
if (VF == 0)
continue;
- MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
+ unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
+ MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
// Shuffle of at least 2 vectors - ignore.
if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
std::fill(Slice.begin(), Slice.end(), NumScalars);
@@ -4076,7 +4107,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
// Try to include as much elements from the mask as possible.
int FirstMin = INT_MAX;
int SecondVecFound = false;
- for (int K : seq<int>(0, PartSz)) {
+ for (int K : seq<int>(Limit)) {
int Idx = Mask[I * PartSz + K];
if (Idx == PoisonMaskElem) {
Value *V = GatheredScalars[I * PartSz + K];
@@ -4101,7 +4132,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
ShuffledSubMasks.set(I);
continue;
}
- for (int K : seq<int>(0, PartSz)) {
+ for (int K : seq<int>(Limit)) {
int Idx = Mask[I * PartSz + K];
if (Idx == PoisonMaskElem)
continue;
@@ -4124,14 +4155,15 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
}
}
};
- int PartSz = NumScalars / NumParts;
+ int PartSz = getPartNumElems(NumScalars, NumParts);
if (!ExtractShuffles.empty())
TransformMaskToOrder(
CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
if (!ExtractShuffles[I])
return 0U;
unsigned VF = 0;
- for (unsigned Idx : seq<unsigned>(0, PartSz)) {
+ unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
+ for (unsigned Idx : seq<unsigned>(Sz)) {
int K = I * PartSz + Idx;
if (ExtractMask[K] == PoisonMaskElem)
continue;
@@ -4762,12 +4794,13 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
::addMask(ReorderMask, TE.ReuseShuffleIndices);
unsigned VF = ReorderMask.size();
OrdersType ResOrder(VF, VF);
- unsigned NumParts = VF / Sz;
+ unsigned NumParts = divideCeil(VF, Sz);
SmallBitVector UsedVals(NumParts);
for (unsigned I = 0; I < VF; I += Sz) {
int Val = PoisonMaskElem;
unsigned UndefCnt = 0;
- if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
+ unsigned Limit = std::min(Sz, VF - I);
+ if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
[&](int Idx) {
if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
Val = Idx;
@@ -6861,23 +6894,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::ExtractElement: {
if (CurrentOrder.empty()) {
LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
+ } else {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ fixupOrderingIndices(CurrentOrder);
}
- LLVM_DEBUG({
- dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
- "with order";
- for (unsigned Idx : CurrentOrder)
- dbgs() << " " << Idx;
- dbgs() << "\n";
- });
- fixupOrderingIndices(CurrentOrder);
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
@@ -6931,28 +6957,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
fixupOrderingIndices(CurrentOrder);
switch (State) {
case TreeEntry::Vectorize:
- if (CurrentOrder.empty()) {
- // Original loads are consecutive and does not require reordering.
- TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices);
+ TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndices, CurrentOrder);
+ if (CurrentOrder.empty())
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- } else {
- // Need to reorder.
- TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndices, CurrentOrder);
+ else
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
- }
TE->setOperandsInOrder();
break;
case TreeEntry::StridedVectorize:
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
- if (CurrentOrder.empty()) {
- TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
- UserTreeIdx, ReuseShuffleIndices);
- } else {
- TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
- UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
- }
+ TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
break;
@@ -7966,6 +7982,10 @@ void BoUpSLP::transformNodes() {
TreeEntry &E = *TE.get();
switch (E.getOpcode()) {
case Instruction::Load: {
+ // No need to reorder masked gather loads, just reorder the scalar
+ // operands.
+ if (E.State != TreeEntry::Vectorize)
+ break;
Type *ScalarTy = E.getMainOp()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
@@ -8279,19 +8299,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Sz;
return std::max(Sz, VecTy->getNumElements());
});
- unsigned NumSrcRegs =
- TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
- if (NumSrcRegs == 0)
- NumSrcRegs = 1;
// FIXME: this must be moved to TTI for better estimation.
- unsigned EltsPerVector = PowerOf2Ceil(std::max(
- divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
+ unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
auto CheckPerRegistersShuffle =
- [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
+ [&](MutableArrayRef<int> Mask,
+            SmallVectorImpl<int> &Indices) -> std::optional<TTI::ShuffleKind> {
+ if (NumElts <= EltsPerVector)
+ return std::nullopt;
DenseSet<int> RegIndices;
// Check that if trying to permute same single/2 input vectors.
TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
int FirstRegId = -1;
+ Indices.assign(1, -1);
for (int &I : Mask) {
if (I == PoisonMaskElem)
continue;
@@ -8301,8 +8320,15 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
RegIndices.insert(RegId);
if (RegIndices.size() > 2)
return std::nullopt;
- if (RegIndices.size() == 2)
+ if (RegIndices.size() == 2) {
ShuffleKind = TTI::SK_PermuteTwoSrc;
+ if (Indices.size() == 1)
+ Indices.push_back(-1);
+ }
+ if (RegId == FirstRegId)
+ Indices.front() = I % NumElts;
+ else
+ Indices.back() = I % NumElts;
I = (I % NumElts) % EltsPerVector +
(RegId == FirstRegId ? 0 : EltsPerVector);
}
@@ -8313,22 +8339,23 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
// Process extracts in blocks of EltsPerVector to check if the source vector
// operand can be re-used directly. If not, add the cost of creating a
// shuffle to extract the values into a vector register.
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ for (unsigned Part : seq<unsigned>(NumParts)) {
if (!ShuffleKinds[Part])
continue;
- ArrayRef<int> MaskSlice =
- Mask.slice(Part * EltsPerVector,
- (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
- ? Mask.size() % EltsPerVector
- : EltsPerVector);
+ ArrayRef<int> MaskSlice = Mask.slice(
+ Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
copy(MaskSlice, SubMask.begin());
+ SmallVector<int> Indices;
std::optional<TTI::ShuffleKind> RegShuffleKind =
- CheckPerRegistersShuffle(SubMask);
+ CheckPerRegistersShuffle(SubMask, Indices);
if (!RegShuffleKind) {
- Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
- FixedVectorType::get(ScalarTy, NumElts),
- MaskSlice);
+ if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
+ !ShuffleVectorInst::isIdentityMask(
+ MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
+ Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
+ FixedVectorType::get(ScalarTy, NumElts),
+ MaskSlice);
continue;
}
if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
@@ -8337,6 +8364,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
FixedVectorType::get(ScalarTy, EltsPerVector),
SubMask);
}
+ for (int Idx : Indices) {
+ Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
+ FixedVectorType::get(ScalarTy, NumElts),
+ std::nullopt, CostKind, Idx,
+ FixedVectorType::get(ScalarTy, EltsPerVector));
+ }
}
return Cost;
}
@@ -8364,11 +8397,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
InVectors.front().get<const TreeEntry *>() == &E1 &&
InVectors.back().get<const TreeEntry *>() == E2) ||
(!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
- assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
+ assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
[](int Idx) { return Idx == PoisonMaskElem; }) &&
"Expected all poisoned elements.");
- ArrayRef<int> SubMask =
- ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
+ ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
return;
}
@@ -8688,10 +8721,11 @@ public:
});
});
SmallPtrSet<Value *, 4> UniqueBases;
- unsigned SliceSize = VL.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
- for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
+ ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
+ for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, Limit))) {
// Ignore non-extractelement scalars.
if (isa<UndefValue>(V) ||
(!SubMask.empty() && SubMask[I] == PoisonMaskElem))
@@ -8788,7 +8822,7 @@ public:
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
- unsigned SliceSize = Mask.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -8805,7 +8839,7 @@ public:
unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
if (NumParts == 0 || NumParts >= Mask.size())
NumParts = 1;
- unsigned SliceSize = Mask.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
const auto *It =
find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
@@ -10662,12 +10696,12 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
Mask.assign(VL.size(), PoisonMaskElem);
- unsigned SliceSize = VL.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
// Scan list of gathered scalars for extractelements that can be represented
// as shuffles.
- MutableArrayRef<Value *> SubVL =
- MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
+ MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
+ Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVector<int> SubMask;
std::optional<TTI::ShuffleKind> Res =
tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
@@ -11071,10 +11105,11 @@ BoUpSLP::isGatherShuffledEntry(
"Expected only single user of the gather node.");
assert(VL.size() % NumParts == 0 &&
"Number of scalars must be divisible by NumParts.");
- unsigned SliceSize = VL.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
SmallVector<std::optional<TTI::ShuffleKind>> Res;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
- ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ ArrayRef<Value *> SubVL =
+ VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
std::optional<TTI::ShuffleKind> SubRes =
isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
@@ -11677,11 +11712,12 @@ public:
// into a long virtual vector register, forming the original vector.
Value *Vec = nullptr;
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
- unsigned SliceSize = E->Scalars.size() / NumParts;
- for (unsigned Part = 0; Part < NumParts; ++Part) {
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
+ for (unsigned Part : seq<unsigned>(NumParts)) {
+ unsigned Limit = getNumElems(E->Scalars.size(), SliceSize, Part);
ArrayRef<Value *> VL =
- ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
- MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
+ ArrayRef(E->Scalars).slice(Part * SliceSize, Limit);
+ MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
constexpr int MaxBases = 2;
SmallVector<Value *, MaxBases> Bases(MaxBases);
#ifndef NDEBUG
@@ -11718,7 +11754,9 @@ public:
assert((Part == 0 || all_of(seq<unsigned>(0, Part),
[&](unsigned P) {
ArrayRef<int> SubMask =
- Mask.slice(P * SliceSize, SliceSize);
+ Mask.slice(P * SliceSize,
+ getNumElems(Mask.size(),
+ SliceSize, P));
return all_of(SubMask, [](int Idx) {
return Idx == PoisonMaskElem;
});
@@ -12102,13 +12140,19 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
Idx == 0) ||
(Mask.size() == InputVF &&
ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
- std::iota(std::next(Mask.begin(), I * SliceSize),
- std::next(Mask.begin(), (I + 1) * SliceSize), 0);
+ std::iota(
+ std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(),
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
+ 0);
} else {
unsigned IVal =
*find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
- std::fill(std::next(Mask.begin(), I * SliceSize),
- std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
+ std::fill(
+ std::next(Mask.begin(), I * SliceSize),
+ std::next(Mask.begin(),
+ I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
+ IVal);
}
return true;
};
@@ -12368,7 +12412,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
}
if (!GatherShuffles.empty()) {
- unsigned SliceSize = E->Scalars.size() / NumParts;
+ unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
for (const auto [I, TEs] : enumerate(Entries)) {
if (TEs.empty()) {
@@ -12378,7 +12422,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
}
assert((TEs.size() == 1 || TEs.size() == 2) &&
"Expected shuffle of 1 or 2 entries.");
- auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
+ unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
+ auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
VecMask.assign(VecMask.size(), PoisonMaskElem);
copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
if (TEs.size() == 1) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 27f8e239b1c0..d71d7580e6ba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -218,7 +219,7 @@ VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan,
LLVMContext &Ctx)
- : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
+ : VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
LVer(nullptr),
TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}
@@ -436,6 +437,7 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, NewBB);
}
+ CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}});
}
return NewBB;
}
@@ -467,6 +469,7 @@ void VPBasicBlock::execute(VPTransformState *State) {
// The Exit block of a loop is always set to be successor 0 of the Exiting
// block.
cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
+ State->CFG.DTU.applyUpdates({{DominatorTree::Insert, ExitingBB, NewBB}});
} else if (PrevVPBB && /* A */
!((SingleHPred = getSingleHierarchicalPredecessor()) &&
SingleHPred->getExitingBasicBlock() == PrevVPBB &&
@@ -829,6 +832,11 @@ void VPlan::execute(VPTransformState *State) {
BasicBlock *VectorPreHeader = State->CFG.PrevBB;
State->Builder.SetInsertPoint(VectorPreHeader->getTerminator());
+ // Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
+ cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
+
// Generate code in the loop pre-header and body.
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
Block->execute(State);
@@ -891,13 +899,10 @@ void VPlan::execute(VPTransformState *State) {
}
}
- // We do not attempt to preserve DT for outer loop vectorization currently.
- if (!EnableVPlanNativePath) {
- BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
- State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
- updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
- State->CFG.ExitBB);
- }
+ State->CFG.DTU.flush();
+ // DT is currently updated for non-native path only.
+ assert(EnableVPlanNativePath || State->CFG.DTU.getDomTree().verify(
+ DominatorTree::VerificationLevel::Fast));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -995,44 +1000,6 @@ void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
LiveOuts.insert({PN, new VPLiveOut(PN, V)});
}
-void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
- BasicBlock *LoopLatchBB,
- BasicBlock *LoopExitBB) {
- // The vector body may be more than a single basic-block by this point.
- // Update the dominator tree information inside the vector body by propagating
- // it from header to latch, expecting only triangular control-flow, if any.
- BasicBlock *PostDomSucc = nullptr;
- for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
- // Get the list of successors of this block.
- std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
- assert(Succs.size() <= 2 &&
- "Basic block in vector loop has more than 2 successors.");
- PostDomSucc = Succs[0];
- if (Succs.size() == 1) {
- assert(PostDomSucc->getSinglePredecessor() &&
- "PostDom successor has more than one predecessor.");
- DT->addNewBlock(PostDomSucc, BB);
- continue;
- }
- BasicBlock *InterimSucc = Succs[1];
- if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
- PostDomSucc = Succs[1];
- InterimSucc = Succs[0];
- }
- assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
- "One successor of a basic block does not lead to the other.");
- assert(InterimSucc->getSinglePredecessor() &&
- "Interim successor has more than one predecessor.");
- assert(PostDomSucc->hasNPredecessors(2) &&
- "PostDom successor has more than two predecessors.");
- DT->addNewBlock(InterimSucc, BB);
- DT->addNewBlock(PostDomSucc, BB);
- }
- // Latch block is a new dominator for the loop exit.
- DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-}
-
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
// Update the operands of all cloned recipes starting at NewEntry. This
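The VPlan.cpp changes replace the hand-rolled updateDominatorTree walk, which only handled triangular control flow, with a DomTreeUpdater that records each inserted or deleted CFG edge and is flushed once after code generation. A minimal sketch of the updater pattern, assuming the standard llvm::DomTreeUpdater API; with the lazy strategy used in the VPlan.h hunk, applyUpdates only queues the edges and flush() performs the actual recalculation:

#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/IR/Dominators.h"

// Queue CFG edge updates as blocks are created or retargeted...
static void recordNewEdge(llvm::DomTreeUpdater &DTU, llvm::BasicBlock *Pred,
                          llvm::BasicBlock *Succ) {
  DTU.applyUpdates({{llvm::DominatorTree::Insert, Pred, Succ}});
}

// ...and apply them in one batch once code generation is done.
static void finishUpdates(llvm::DomTreeUpdater &DTU) {
  DTU.flush();
}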
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 4b3cb15b5e1e..3aee17921086 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -35,6 +35,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -372,7 +373,11 @@ struct VPTransformState {
/// of replication, maps the BasicBlock of the last replica created.
SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
- CFGState() = default;
+ /// Updater for the DominatorTree.
+ DomTreeUpdater DTU;
+
+ CFGState(DominatorTree *DT)
+ : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {}
/// Returns the BasicBlock* mapped to the pre-header of the loop region
/// containing \p R.
@@ -382,9 +387,6 @@ struct VPTransformState {
/// Hold a pointer to LoopInfo to register new basic blocks in the loop.
LoopInfo *LI;
- /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
- DominatorTree *DT;
-
/// Hold a reference to the IRBuilder used to generate output IR code.
IRBuilderBase &Builder;
@@ -3289,13 +3291,6 @@ public:
/// Clone the current VPlan, update all VPValues of the new VPlan and cloned
/// recipes to refer to the clones, and return it.
VPlan *duplicate();
-
-private:
- /// Add to the given dominator tree the header block and every new basic block
- /// that was created between it and the latch block, inclusive.
- static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopHeaderBB,
- BasicBlock *LoopLatchBB,
- BasicBlock *LoopExitBB);
};
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 7ff8d8e0ea15..422579ea8b84 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1318,8 +1318,16 @@ void VPlanTransforms::addActiveLaneMask(
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
-void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ // The transform updates all users of inductions to work based on EVL, instead
+ // of the VF directly. At the moment, widened inductions cannot be updated, so
+ // bail out if the plan contains any.
+ if (any_of(Header->phis(), [](VPRecipeBase &Phi) {
+ return (isa<VPWidenIntOrFpInductionRecipe>(&Phi) ||
+ isa<VPWidenPointerInductionRecipe>(&Phi));
+ }))
+ return false;
auto *CanonicalIVPHI = Plan.getCanonicalIV();
VPValue *StartV = CanonicalIVPHI->getStartValue();
@@ -1377,6 +1385,7 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) {
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
// TODO: support unroll factor > 1.
Plan.setUF(1);
+ return true;
}
void VPlanTransforms::dropPoisonGeneratingRecipes(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 0cbc70713d9c..96b8a6639723 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -104,7 +104,8 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
- static void addExplicitVectorLength(VPlan &Plan);
+ /// \returns true if the transformation succeeds, or false if it doesn't.
+ static bool tryAddExplicitVectorLength(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/test/Analysis/CostModel/AArch64/cast.ll b/llvm/test/Analysis/CostModel/AArch64/cast.ll
index 0cd444f84985..fa778864ae97 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 %s | FileCheck --check-prefixes=CHECK,CHECK-NOFP16 %s
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -force-streaming-compatible-sve %s | FileCheck --check-prefixes=SVE,SVE128-NO-NEON %s
+; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -force-streaming-compatible %s | FileCheck --check-prefixes=SVE,SVE128-NO-NEON %s
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 %s | FileCheck --check-prefixes=CHECK,CHECK-FP16 %s
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck --check-prefixes=SVE,FIXED-MIN-256 %s
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=2048 %s | FileCheck --check-prefixes=SVE,FIXED-MIN-2048 %s
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index cc1532ee33dc..e1a9ee114d26 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -13,15 +13,15 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
@@ -33,15 +33,15 @@ define void @foo_no_vscale_range() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
index be5cca0765ed..a18156744a36 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -7,603 +7,1140 @@
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -cost-kind=code-size -S | FileCheck -check-prefixes=ALL-SIZE,VI-SIZE %s
; END.
-define amdgpu_kernel void @shufflevector_i16() {
+define amdgpu_kernel void @shufflevector_i16(<2 x i16> %vec1, <2 x i16> %vec2) {
; GFX9-10-LABEL: 'shufflevector_i16'
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> zeroinitializer
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; VI-LABEL: 'shufflevector_i16'
-; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> zeroinitializer
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; VI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX9-10-SIZE-LABEL: 'shufflevector_i16'
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> zeroinitializer
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; VI-SIZE-LABEL: 'shufflevector_i16'
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> zeroinitializer
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> zeroinitializer
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> zeroinitializer
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %shuf00 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
- %shuf01 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
- %shuf10 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
- %shuf11 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
- %shuf02 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 2>
- %shuf20 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 0>
- %shuf22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 2>
- %shuf03 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 0, i32 3>
- %shuf30 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 0>
- %shuf33 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 3>
- %shuf12 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 2>
- %shuf21 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 1>
- %shuf13 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 3>
- %shuf31 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
- %shuf23 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 2, i32 3>
- %shuf32 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 2>
- %shuf000 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 0>
- %shuf001 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 1>
- %shuf010 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 0>
- %shuf011 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 1, i32 1>
- %shuf100 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 0>
- %shuf101 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 0, i32 1>
- %shuf110 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 0>
- %shuf111 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 1>
- %shuf002 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 0, i32 2>
- %shuf020 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 0>
- %shuf022 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 0, i32 2, i32 2>
- %shuf200 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 0>
- %shuf202 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 0, i32 2>
- %shuf220 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 0>
- %shuf222 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 2>
- %shuf112 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 1, i32 2>
- %shuf121 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 1>
- %shuf122 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 1, i32 2, i32 2>
- %shuf211 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 1>
- %shuf212 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 1, i32 2>
- %shuf221 = shufflevector <2 x i16> undef, <2 x i16> undef, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> zeroinitializer
+ %shuf01 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 1>
+ %shuf10 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 0>
+ %shuf11 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 1>
+ %shuf02 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+ %shuf20 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 0>
+ %shuf22 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 2>
+ %shuf03 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 0, i32 3>
+ %shuf30 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 0>
+ %shuf33 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 3>
+ %shuf12 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 2>
+ %shuf21 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 1>
+ %shuf13 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 1, i32 3>
+ %shuf31 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 1>
+ %shuf23 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 2, i32 3>
+ %shuf32 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <2 x i32> <i32 3, i32 2>
+ %shuf000 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221 = shufflevector <2 x i16> %vec1, <2 x i16> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> zeroinitializer
+ %shuf01_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 1>
+ %shuf10_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 0>
+ %shuf11_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 1>
+ %shuf02_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 2>
+ %shuf20_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 0>
+ %shuf22_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 2>
+ %shuf03_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 0, i32 3>
+ %shuf30_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 0>
+ %shuf33_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 3>
+ %shuf12_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 2>
+ %shuf21_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 1>
+ %shuf13_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 1, i32 3>
+ %shuf31_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 1>
+ %shuf23_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 2, i32 3>
+ %shuf32_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <2 x i32> <i32 3, i32 2>
+ %shuf000_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221_2 = shufflevector <2 x i16> %vec1, <2 x i16> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
ret void
}
; Should not assert
-define amdgpu_kernel void @shufflevector_i8() {
+define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
; ALL-LABEL: 'shufflevector_i8'
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> zeroinitializer
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'shufflevector_i8'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf000 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> zeroinitializer
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf222 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %shuf00 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
- %shuf01 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 1>
- %shuf10 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
- %shuf11 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 1>
- %shuf02 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 2>
- %shuf20 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 0>
- %shuf22 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 2>
- %shuf03 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 0, i32 3>
- %shuf30 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 0>
- %shuf33 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 3>
- %shuf12 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 2>
- %shuf21 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 1>
- %shuf13 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 3>
- %shuf31 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 1>
- %shuf23 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 2, i32 3>
- %shuf32 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 3, i32 2>
- %shuf000 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 0>
- %shuf001 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 1>
- %shuf010 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 0>
- %shuf011 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 1, i32 1>
- %shuf100 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 0>
- %shuf101 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 0, i32 1>
- %shuf110 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 0>
- %shuf111 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 1>
- %shuf002 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 0, i32 2>
- %shuf020 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 0>
- %shuf022 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 0, i32 2, i32 2>
- %shuf200 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 0>
- %shuf202 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 0, i32 2>
- %shuf220 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 0>
- %shuf222 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 2>
- %shuf112 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 1, i32 2>
- %shuf121 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 1>
- %shuf122 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 1, i32 2, i32 2>
- %shuf211 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 1>
- %shuf212 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 1, i32 2>
- %shuf221 = shufflevector <2 x i8> undef, <2 x i8> undef, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
+ %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
+ %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
+ %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
+ %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
+ %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
+ %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
+ %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
+ %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
+ %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
+ %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
+ %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
+ %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
+ %shuf31 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 1>
+ %shuf23 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 3>
+ %shuf32 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 2>
+ %shuf000 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
+ %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
+ %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
+ %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
+ %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
+ %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
+ %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
+ %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
+ %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
+ %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
+ %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
+ %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
+ %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
+ %shuf31_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 1>
+ %shuf23_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 3>
+ %shuf32_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 2>
+ %shuf000_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
ret void
}
-define amdgpu_kernel void @shufflevector_i32() {
+define amdgpu_kernel void @shufflevector_i32(<2 x i32> %vec1, <2 x i32> %vec2) {
; ALL-LABEL: 'shufflevector_i32'
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> zeroinitializer
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> zeroinitializer
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'shufflevector_i32'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> zeroinitializer
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 0>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 1>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 2>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf00_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf10_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf11_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf02_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf20_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf22_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf03_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf30_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf33_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf12_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf21_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf13_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf31_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf23_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf32_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf000_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> zeroinitializer
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf001_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf010_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf011_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf100_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf101_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf110_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf111_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf002_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf020_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf022_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf200_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf202_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf220_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf222_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf112_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf121_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf122_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf211_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf212_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf221_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %shuf00 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
- %shuf01 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 1>
- %shuf10 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
- %shuf11 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
- %shuf02 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 2>
- %shuf20 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 0>
- %shuf22 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 2>
- %shuf03 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 0, i32 3>
- %shuf30 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 0>
- %shuf33 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 3>
- %shuf12 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 2>
- %shuf21 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 1>
- %shuf13 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 3>
- %shuf31 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 1>
- %shuf23 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 2, i32 3>
- %shuf32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 3, i32 2>
- %shuf000 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 0>
- %shuf001 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 1>
- %shuf010 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 0>
- %shuf011 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 1, i32 1>
- %shuf100 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 0>
- %shuf101 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
- %shuf110 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 0>
- %shuf111 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 1>
- %shuf002 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 0, i32 2>
- %shuf020 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 0>
- %shuf022 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 0, i32 2, i32 2>
- %shuf200 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 0>
- %shuf202 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 0, i32 2>
- %shuf220 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 0>
- %shuf222 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 2>
- %shuf112 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 1, i32 2>
- %shuf121 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 1>
- %shuf122 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 1, i32 2, i32 2>
- %shuf211 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 1>
- %shuf212 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 1, i32 2>
- %shuf221 = shufflevector <2 x i32> undef, <2 x i32> undef, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> zeroinitializer
+ %shuf01 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 1>
+ %shuf10 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 0>
+ %shuf11 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 1>
+ %shuf02 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 2>
+ %shuf20 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 0>
+ %shuf22 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 2>
+ %shuf03 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 0, i32 3>
+ %shuf30 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 0>
+ %shuf33 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 3>
+ %shuf12 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 2>
+ %shuf21 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 1>
+ %shuf13 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 1, i32 3>
+ %shuf31 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 1>
+ %shuf23 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 2, i32 3>
+ %shuf32 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <2 x i32> <i32 3, i32 2>
+ %shuf000 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221 = shufflevector <2 x i32> %vec1, <2 x i32> %vec1, <3 x i32> <i32 2, i32 2, i32 1>
+ %shuf00_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> zeroinitializer
+ %shuf01_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 1>
+ %shuf10_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 0>
+ %shuf11_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 1>
+ %shuf02_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 2>
+ %shuf20_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 0>
+ %shuf22_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 2>
+ %shuf03_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 0, i32 3>
+ %shuf30_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 0>
+ %shuf33_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 3>
+ %shuf12_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 2>
+ %shuf21_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 1>
+ %shuf13_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 1, i32 3>
+ %shuf31_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 1>
+ %shuf23_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 2, i32 3>
+ %shuf32_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> <i32 3, i32 2>
+ %shuf000_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 0>
+ %shuf001_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 1>
+ %shuf010_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 0>
+ %shuf011_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 1, i32 1>
+ %shuf100_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 0>
+ %shuf101_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 0, i32 1>
+ %shuf110_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 0>
+ %shuf111_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 1>
+ %shuf002_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 0, i32 2>
+ %shuf020_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 0>
+ %shuf022_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 0, i32 2, i32 2>
+ %shuf200_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 0>
+ %shuf202_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 0, i32 2>
+ %shuf220_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 0>
+ %shuf222_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 2>
+ %shuf112_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 1, i32 2>
+ %shuf121_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 1>
+ %shuf122_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 1, i32 2, i32 2>
+ %shuf211_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 1>
+ %shuf212_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 1, i32 2>
+ %shuf221_2 = shufflevector <2 x i32> %vec1, <2 x i32> %vec2, <3 x i32> <i32 2, i32 2, i32 1>
ret void
}
; Other shuffle cases
-define void @shuffle() {
+define void @shuffle(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i8> %i8v16, <16 x i8> %i8v16_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2, <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x double> %doublev2, <2 x double> %doublev2_2) {
; GFX9-10-LABEL: 'shuffle'
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
; GFX9-10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; VI-LABEL: 'shuffle'
-; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+; VI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
; VI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; GFX9-10-SIZE-LABEL: 'shuffle'
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
; GFX9-10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; VI-SIZE-LABEL: 'shuffle'
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
-; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
; VI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %v2i8_2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
- %v2i8_4 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v4i8_4 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v2i8_8 = shufflevector <2 x i8> undef, <2 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v4i8_8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v6i8_8 = shufflevector <6 x i8> undef, <6 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v8i8_8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v16i8_16 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v2i16_2 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
- %v4i16_4 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v8i16_8 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
- %v2i32_2 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
- %v4i32_4 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v2f32_2 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> <i32 1, i32 0>
- %v4f32_4 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
- %v2i64_2 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
- %v2f64_2 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %v2i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <2 x i32> <i32 1, i32 0>
+ %v2i8_2_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <2 x i32> <i32 1, i32 0>
+ %v2i8_4 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v2i8_4_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4i8_4 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4i8_4_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v2i8_8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v2i8_8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v4i8_8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v4i8_8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v6i8_8 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v6i8_8_2 = shufflevector <6 x i8> %i8v6, <6 x i8> %i8v6_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v8i8_8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v8i8_8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v16i8_16 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v16i8_16_2 = shufflevector <16 x i8> %i8v16, <16 x i8> %i8v16_2, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v2i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <2 x i32> <i32 1, i32 0>
+ %v2i16_2_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <2 x i32> <i32 1, i32 0>
+ %v4i16_4 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4i16_4_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v8i16_8 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v8i16_8_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1>
+ %v2i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <2 x i32> <i32 1, i32 0>
+ %v2i32_2_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <2 x i32> <i32 1, i32 0>
+ %v4i32_4 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4i32_4_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v2f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <2 x i32> <i32 1, i32 0>
+ %v2f32_2_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <2 x i32> <i32 1, i32 0>
+ %v4f32_4 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v4f32_4_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %v2i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <2 x i32> <i32 1, i32 0>
+ %v2i64_2_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x i32> <i32 1, i32 0>
+ %v2f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <2 x i32> <i32 1, i32 0>
+ %v2f64_2_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <2 x i32> <i32 1, i32 0>
ret void
}
-define void @concat() {
+define void @concat(<2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i8> %i8v8, <8 x i8> %i8v8_2, <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x half> %halfv8, <8 x half> %halfv8_2, <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i16> %i16v8, <8 x i16> %i16v8_2, <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> %i32v4, <4 x i32> %i32v4_2, <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x float> %floatv4, <4 x float> %floatv4_2, <2 x i64> %i64v2, <2 x i64> %i64v2_2, <2 x double> %doublev2, <2 x double> %doublev2_2) {
; ALL-LABEL: 'concat'
-; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
; ALL-SIZE-LABEL: 'concat'
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
- %v4i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %v4i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %v4i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v4i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v4f16 = shufflevector <2 x half> undef, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8f16 = shufflevector <4 x half> undef, <4 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v16f16 = shufflevector <8 x half> undef, <8 x half> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %v4f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %v8f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %v4f64 = shufflevector <2 x double> undef, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v4i8 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i8 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16i8 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4i16 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i16 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16i16 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4i32 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i32 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4i64 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v4f16 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8f16 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16f16 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4f32 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8f32 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4f64 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v4i8_2 = shufflevector <2 x i8> %i8v2, <2 x i8> %i8v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i8_2 = shufflevector <4 x i8> %i8v4, <4 x i8> %i8v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16i8_2 = shufflevector <8 x i8> %i8v8, <8 x i8> %i8v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4i16_2 = shufflevector <2 x i16> %i16v2, <2 x i16> %i16v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i16_2 = shufflevector <4 x i16> %i16v4, <4 x i16> %i16v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16i16_2 = shufflevector <8 x i16> %i16v8, <8 x i16> %i16v8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4i32_2 = shufflevector <2 x i32> %i32v2, <2 x i32> %i32v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8i32_2 = shufflevector <4 x i32> %i32v4, <4 x i32> %i32v4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4i64_2 = shufflevector <2 x i64> %i64v2, <2 x i64> %i64v2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v4f16_2 = shufflevector <2 x half> %halfv2, <2 x half> %halfv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8f16_2 = shufflevector <4 x half> %halfv4, <4 x half> %halfv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v16f16_2 = shufflevector <8 x half> %halfv8, <8 x half> %halfv8_2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v4f32_2 = shufflevector <2 x float> %floatv2, <2 x float> %floatv2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v8f32_2 = shufflevector <4 x float> %floatv4, <4 x float> %floatv4_2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v4f64_2 = shufflevector <2 x double> %doublev2, <2 x double> %doublev2_2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret void
}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index f333bc3fa231..809b15b20049 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
; RUN: opt -S -disable-output -passes='print<access-info>' < %s 2>&1 | FileCheck %s
@@ -7,7 +8,8 @@
%int_pair = type { i32, i32 }
-; CHECK-LABEL: function 'backdep_type_size_equivalence':
+define void @backdep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-LABEL: 'backdep_type_size_equivalence'
; CHECK-NEXT: loop:
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 3200 bits
; CHECK-NEXT: Dependences:
@@ -23,10 +25,15 @@
; CHECK-NEXT: store float %val, ptr %gep.iv.min.100, align 8 ->
; CHECK-NEXT: store i32 %indvars.iv.i32, ptr %gep.iv, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Grouped accesses:
-
-define void @backdep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: {(4 + (8 * %n) + %vec),+,8}<%loop> Added Flags: <nusw>
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %loop
@@ -72,20 +79,25 @@ exit:
; different store size than the i32 type, even though their alloc sizes are
; equivalent. This is a negative test to ensure that they are not analyzed as
; in the tests above.
-;
-; CHECK-LABEL: function 'backdep_type_store_size_equivalence':
+
+define void @backdep_type_store_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-LABEL: 'backdep_type_store_size_equivalence'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
-; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: %ld.f32 = load float, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i19 %indvars.iv.i19, ptr %gep.iv, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Grouped accesses:
-
-define void @backdep_type_store_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %loop
@@ -114,10 +126,11 @@ exit:
; are done as i64 and i32 types. This is a negative test to ensure that they
; are not analyzed as in the tests above.
-; CHECK-LABEL: function 'neg_dist_dep_type_size_equivalence':
+define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
; CHECK-NEXT: Dependences:
; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
@@ -129,12 +142,17 @@ exit:
; CHECK-EMPTY:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 ->
-; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
+; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
-
-define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: {((8 * %n) + %vec),+,8}<%loop> Added Flags: <nusw>
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %loop
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
index 42d87edd8b4b..f1ae1a897fff 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes='print<access-info>' -disable-output < %s 2>&1 | FileCheck %s
; Check that loop-independent forward dependences are discovered properly.
@@ -21,17 +22,31 @@
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @f(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %N) {
-
-; CHECK: Dependences:
-; CHECK-NEXT: Forward:
-; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4 ->
-; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
-; CHECK: ForwardButPreventsForwarding:
-; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
-; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
-; CHECK: Forward:
-; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
-; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4
+; CHECK-LABEL: 'f'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4 ->
+; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
+; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
+; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
index 4d4d2bf3eee8..d3e589cf99cf 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pr64637.ll
@@ -1,4 +1,5 @@
-; RUN: opt -S -passes='print<access-info>' -pass-remarks-analysis=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ANALYSIS
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes='print<access-info>' -pass-remarks-analysis=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
; Test that LoopVectorize doesn't report 'Use #pragma loop distribute(enable) to allow loop distribution'
; when we already add #pragma clang loop distribute(enable).
@@ -17,8 +18,31 @@
; }
define void @foo(ptr noalias nocapture noundef %y, ptr noalias nocapture noundef readnone %x, ptr noalias nocapture noundef readonly %indices, i32 noundef %n) {
-; ANALYSIS: Report: unsafe dependent memory operations in loop.
-; ANALYSIS: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-LABEL: 'foo'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop.
+; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add8, ptr %arrayidx12, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %add1, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add8, ptr %arrayidx12, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Forward:
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add1, ptr %arrayidx, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%cmp22 = icmp sgt i32 %n, 0
br i1 %cmp22, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
index 07e32f443554..60fe8b4fcbed 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/select-dependence.ll
@@ -1,30 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes='print<access-info>' -disable-output 2>&1 < %s | FileCheck %s
-; CHECK: Dependences:
-; CHECK-NEXT: Unknown:
-; CHECK-NEXT: %t63 = load double, ptr %t62, align 8 ->
-; CHECK-NEXT: store double %t63, ptr %t64, align 8
-
-define i32 @test() {
- %a1 = alloca [128 x double], align 8
- %a2 = alloca [128 x double], align 8
- %a3 = alloca [128 x double], align 8
- %t30 = getelementptr double, ptr %a2, i64 -32
+define void @test(ptr noalias %x, ptr noalias %y, ptr noalias %z) {
+; CHECK-LABEL: 'test'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Unknown data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Unknown:
+; CHECK-NEXT: %load = load double, ptr %gep.sel, align 8 ->
+; CHECK-NEXT: store double %load, ptr %gep.sel2, align 8
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %gep.y = getelementptr double, ptr %y, i64 -32
br label %loop
loop:
- %t58 = phi i64 [ %t65, %loop ], [ 0, %0 ]
- %t59 = icmp ule i64 %t58, 32
- %t60 = select i1 %t59, ptr %a1, ptr %t30
- %t62 = getelementptr inbounds double, ptr %t60, i64 %t58
- %t63 = load double, ptr %t62, align 8
- %t61 = select i1 %t59, ptr %a2, ptr %a3
- %t64 = getelementptr inbounds double, ptr %t61, i64 %t58
- store double %t63, ptr %t64, align 8
- %t65 = add nuw nsw i64 %t58, 1
- %t66 = icmp eq i64 %t65, 94
- br i1 %t66, label %exit, label %loop
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %icmp = icmp ule i64 %iv, 32
+ %sel = select i1 %icmp, ptr %x, ptr %gep.y
+ %gep.sel = getelementptr inbounds double, ptr %sel, i64 %iv
+ %load = load double, ptr %gep.sel, align 8
+ %sel2 = select i1 %icmp, ptr %y, ptr %z
+ %gep.sel2 = getelementptr inbounds double, ptr %sel2, i64 %iv
+ store double %load, ptr %gep.sel2, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv, 94
+ br i1 %exit.cond, label %exit, label %loop
exit:
- ret i32 0
+ ret void
}
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
index bfdd15f170d0..ef19e173b659 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
; RUN: opt -passes='print<access-info>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -10,13 +11,19 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; B[i] = A[i] + 1;
; }
-; CHECK: function 'nodep_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-
define void @nodep_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'nodep_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i32, ptr %A, i64 1
br label %for.body
@@ -42,17 +49,23 @@ for.body: ; preds = %entry, %for.body
; A[i] = i;
; sum += A[i+3];
; }
-;
+;
; return sum;
; }
-; CHECK: function 'nodep_Write_Read':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-
define i32 @nodep_Write_Read(ptr nocapture %A) {
+; CHECK-LABEL: 'nodep_Write_Read'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -81,13 +94,19 @@ for.body: ; preds = %entry, %for.body
; }
; }
-; CHECK: function 'nodep_Write_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-
define void @nodep_Write_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'nodep_Write_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -115,16 +134,24 @@ for.body: ; preds = %entry, %for.body
; A[i+3] = A[i] + 1;
; }
-; CHECK: function 'unsafe_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx3, align 4
-
define void @unsafe_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx3, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -155,16 +182,24 @@ for.body: ; preds = %entry, %for.body
; return sum;
; }
-; CHECK: function 'unsafe_Write_Read':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
-
define i32 @unsafe_Write_Read(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_Write_Read'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -192,16 +227,24 @@ for.body: ; preds = %entry, %for.body
; }
; }
-; CHECK: function 'unsafe_Write_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx3, align 4
-
define void @unsafe_Write_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_Write_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %2, ptr %arrayidx3, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
br label %for.body
@@ -230,15 +273,23 @@ for.body: ; preds = %entry, %for.body
; B[i] = A[i] + 1;
; }
-; CHECK: function 'vectorizable_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-
define void @vectorizable_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i32, ptr %A, i64 4
br label %for.body
@@ -265,19 +316,27 @@ for.body: ; preds = %entry, %for.body
; A[i] = i;
; sum += B[i];
; }
-;
+;
; return sum;
; }
-; CHECK: function 'vectorizable_Write_Read':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
-
define i32 @vectorizable_Write_Read(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_Write_Read'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i32, ptr %A, i64 4
br label %for.body
@@ -307,15 +366,23 @@ for.body: ; preds = %entry, %for.body
; }
; }
-; CHECK: function 'vectorizable_Write_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx2, align 4
-
define void @vectorizable_Write_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_Write_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %2, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i32, ptr %A, i64 4
br label %for.body
@@ -346,16 +413,24 @@ for.body: ; preds = %entry, %for.body
; FIXME: This case looks like the previous case @vectorizable_Read_Write. It should
; be vectorizable.
-; CHECK: function 'vectorizable_unscaled_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-
define void @vectorizable_unscaled_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_unscaled_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i8, ptr %A, i64 14
br label %for.body
@@ -382,19 +457,27 @@ for.body: ; preds = %entry, %for.body
; A[i] = i;
; sum += B[i];
; }
-;
+;
; return sum;
; }
-; CHECK: function 'vectorizable_unscaled_Write_Read':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Memory dependences are safe
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: BackwardVectorizable:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
-
define i32 @vectorizable_unscaled_Write_Read(ptr nocapture %A) {
+; CHECK-LABEL: 'vectorizable_unscaled_Write_Read'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: BackwardVectorizable:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: %1 = load i32, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i8, ptr %A, i64 17
br label %for.body
@@ -422,16 +505,24 @@ for.body: ; preds = %entry, %for.body
; B[i] = A[i] + 1;
; }
-; CHECK: function 'unsafe_unscaled_Read_Write':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-
define void @unsafe_unscaled_Read_Write(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_unscaled_Read_Write'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i8, ptr %A, i64 11
br label %for.body
@@ -451,15 +542,6 @@ for.body: ; preds = %entry, %for.body
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
-; CHECK: function 'unsafe_unscaled_Read_Write2':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
-; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
-
; void unsafe_unscaled_Read_Write2(int *A) {
; int *B = (int *)((char *)A + 1);
; for (unsigned i = 0; i < 1024; i+=2)
@@ -467,6 +549,23 @@ for.body: ; preds = %entry, %for.body
; }
define void @unsafe_unscaled_Read_Write2(ptr nocapture %A) {
+; CHECK-LABEL: 'unsafe_unscaled_Read_Write2'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 ->
+; CHECK-NEXT: store i32 %add, ptr %arrayidx2, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%add.ptr = getelementptr inbounds i8, ptr %A, i64 1
br label %for.body
@@ -500,19 +599,28 @@ for.body: ; preds = %entry, %for.body
;
; The access (2) has overlaps with (1) and (3).
-; CHECK: function 'interleaved_stores':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop
-; CHECK-NEXT: Backward loop carried data dependence.
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Backward:
-; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx9, align 4
-; CHECK: Backward:
-; CHECK-NEXT: store i32 %0, ptr %arrayidx2, align 4 ->
-; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4
-
define void @interleaved_stores(ptr nocapture %A) {
+; CHECK-LABEL: 'interleaved_stores'
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT: Backward loop carried data dependence.
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4 ->
+; CHECK-NEXT: store i32 %2, ptr %arrayidx9, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Backward:
+; CHECK-NEXT: store i32 %0, ptr %arrayidx2, align 4 ->
+; CHECK-NEXT: store i32 %2, ptr %arrayidx5, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
entry:
%incdec.ptr = getelementptr inbounds i8, ptr %A, i64 1
br label %for.body
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
index 6cc045d7a681..3da0f543c5c1 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/symbolic-stride.ll
@@ -95,6 +95,127 @@ exit:
ret void
}
+define void @single_stride_castexpr(i32 %offset, ptr %src, ptr %dst, i1 %cond) {
+; CHECK-LABEL: 'single_stride_castexpr'
+; CHECK-NEXT: inner.loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT: %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2
+; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT: %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv.3
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[GRP1]]:
+; CHECK-NEXT: (Low: ((4 * %iv.1) + %dst) High: (804 + (4 * %iv.1) + %dst))
+; CHECK-NEXT: Member: {((4 * %iv.1) + %dst),+,4}<%inner.loop>
+; CHECK-NEXT: Group [[GRP2]]:
+; CHECK-NEXT: (Low: %src High: (804 + %src))
+; CHECK-NEXT: Member: {%src,+,4}<nuw><%inner.loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: Equal predicate: %offset == 1
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: [PSE] %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2:
+; CHECK-NEXT: {((4 * %iv.1) + %dst),+,(4 * (sext i32 %offset to i64))<nsw>}<%inner.loop>
+; CHECK-NEXT: --> {((4 * %iv.1) + %dst),+,4}<%inner.loop>
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %offset.ext = sext i32 %offset to i64
+ br label %outer.header
+
+outer.header:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+ br i1 %cond, label %inner.loop, label %exit
+
+inner.loop:
+ %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+ %iv.3 = phi i32 [ 0, %outer.header ], [ %iv.3.next, %inner.loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv.3
+ %load = load i32, ptr %gep.src, align 8
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2
+ store i32 %load, ptr %gep.dst, align 8
+ %iv.2.next = add i64 %iv.2, %offset.ext
+ %iv.3.next = add i32 %iv.3, 1
+ %ec = icmp eq i32 %iv.3, 200
+ br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+ ret void
+}
+
+define void @single_stride_castexpr_multiuse(i32 %offset, ptr %src, ptr %dst, i1 %cond) {
+; CHECK-LABEL: 'single_stride_castexpr_multiuse'
+; CHECK-NEXT: inner.loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT: %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2
+; CHECK-NEXT: Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT: %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv.3
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[GRP3]]:
+; CHECK-NEXT: (Low: (((4 * %iv.1) + %dst) umin ((4 * %iv.1) + (4 * (sext i32 %offset to i64) * (200 + (-1 * (zext i32 %offset to i64))<nsw>)<nsw>) + %dst)) High: (4 + (((4 * %iv.1) + %dst) umax ((4 * %iv.1) + (4 * (sext i32 %offset to i64) * (200 + (-1 * (zext i32 %offset to i64))<nsw>)<nsw>) + %dst))))
+; CHECK-NEXT: Member: {((4 * %iv.1) + %dst),+,(4 * (sext i32 %offset to i64))<nsw>}<%inner.loop>
+; CHECK-NEXT: Group [[GRP4]]:
+; CHECK-NEXT: (Low: ((4 * (zext i32 %offset to i64))<nuw><nsw> + %src) High: (804 + %src))
+; CHECK-NEXT: Member: {((4 * (zext i32 %offset to i64))<nuw><nsw> + %src),+,4}<%inner.loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: outer.header:
+; CHECK-NEXT: Report: loop is not the innermost loop
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %offset.ext = sext i32 %offset to i64
+ %offset.zext = zext i32 %offset to i64
+ br label %outer.header
+
+outer.header:
+ %iv.1 = phi i64 [ 0, %entry ], [ %iv.2.next, %inner.loop ]
+ br i1 %cond, label %inner.loop, label %exit
+
+inner.loop:
+ %iv.2 = phi i64 [ %iv.1, %outer.header ], [ %iv.2.next, %inner.loop ]
+ %iv.3 = phi i64 [ %offset.zext, %outer.header ], [ %iv.3.next, %inner.loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv.3
+ %load = load i32, ptr %gep.src, align 8
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv.2
+ store i32 %load, ptr %gep.dst, align 8
+ %iv.2.next = add i64 %iv.2, %offset.ext
+ %iv.3.next = add i64 %iv.3, 1
+ %ec = icmp eq i64 %iv.3, 200
+ br i1 %ec, label %outer.header, label %inner.loop
+
+exit:
+ ret void
+}
+
; A loop with two symbolic strides.
define void @two_strides(ptr noalias %A, ptr noalias %B, i64 %N, i64 %stride.1, i64 %stride.2) {
; CHECK-LABEL: 'two_strides'
diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
index 2117c779f4b3..e9faf98eee44 100644
--- a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
+++ b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll
@@ -4,13 +4,14 @@
define void @ule_from_zero(i32 %M, i32 %N) {
; CHECK-LABEL: 'ule_from_zero'
; CHECK-NEXT: Determining loop execution counts for: @ule_from_zero
-; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count.
-; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: <multiple exits> backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))<nuw><nsw>)
+; CHECK-NEXT: exit count for loop: (1 + (zext i32 %M to i64))<nuw><nsw>
; CHECK-NEXT: exit count for latch: %N
-; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1
-; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is %N
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))<nuw><nsw>)
+; CHECK-NEXT: symbolic max exit count for loop: (1 + (zext i32 %M to i64))<nuw><nsw>
; CHECK-NEXT: symbolic max exit count for latch: %N
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
br label %loop
@@ -61,13 +62,14 @@ exit:
define void @ule_from_unknown(i32 %M, i32 %N, i32 %S) {
; CHECK-LABEL: 'ule_from_unknown'
; CHECK-NEXT: Determining loop execution counts for: @ule_from_unknown
-; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count.
-; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: <multiple exits> backedge-taken count is (((-1 * (zext i32 %S to i64))<nsw> + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64))<nuw><nsw>)) umin_seq (zext i32 ((-1 * %S) + %N) to i64))
+; CHECK-NEXT: exit count for loop: ((-1 * (zext i32 %S to i64))<nsw> + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64))<nuw><nsw>))
; CHECK-NEXT: exit count for latch: ((-1 * %S) + %N)
-; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1
-; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-1 * %S) + %N)
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (((-1 * (zext i32 %S to i64))<nsw> + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64))<nuw><nsw>)) umin_seq (zext i32 ((-1 * %S) + %N) to i64))
+; CHECK-NEXT: symbolic max exit count for loop: ((-1 * (zext i32 %S to i64))<nsw> + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64))<nuw><nsw>))
; CHECK-NEXT: symbolic max exit count for latch: ((-1 * %S) + %N)
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
br label %loop
@@ -96,6 +98,9 @@ define void @ule_from_zero_no_nuw(i32 %M, i32 %N) {
; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is %N
; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
; CHECK-NEXT: symbolic max exit count for latch: %N
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))<nuw><nsw>)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {0,+,1}<%loop> Added Flags: <nusw>
;
entry:
br label %loop
@@ -117,13 +122,14 @@ exit:
define void @sle_from_int_min(i32 %M, i32 %N) {
; CHECK-LABEL: 'sle_from_int_min'
; CHECK-NEXT: Determining loop execution counts for: @sle_from_int_min
-; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count.
-; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: <multiple exits> backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))<nsw>)
+; CHECK-NEXT: exit count for loop: (2147483649 + (sext i32 %M to i64))<nsw>
; CHECK-NEXT: exit count for latch: (-2147483648 + %N)
-; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1
-; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-2147483648 + %N)
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))<nsw>)
+; CHECK-NEXT: symbolic max exit count for loop: (2147483649 + (sext i32 %M to i64))<nsw>
; CHECK-NEXT: symbolic max exit count for latch: (-2147483648 + %N)
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
br label %loop
@@ -174,13 +180,14 @@ exit:
define void @sle_from_unknown(i32 %M, i32 %N, i32 %S) {
; CHECK-LABEL: 'sle_from_unknown'
; CHECK-NEXT: Determining loop execution counts for: @sle_from_unknown
-; CHECK-NEXT: Loop %loop: <multiple exits> Unpredictable backedge-taken count.
-; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: <multiple exits> backedge-taken count is (((-1 * (sext i32 %S to i64))<nsw> + ((sext i32 %S to i64) smax (1 + (sext i32 %M to i64))<nsw>)) umin_seq (zext i32 ((-1 * %S) + %N) to i64))
+; CHECK-NEXT: exit count for loop: ((-1 * (sext i32 %S to i64))<nsw> + ((sext i32 %S to i64) smax (1 + (sext i32 %M to i64))<nsw>))
; CHECK-NEXT: exit count for latch: ((-1 * %S) + %N)
-; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1
-; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-1 * %S) + %N)
-; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295
+; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (((-1 * (sext i32 %S to i64))<nsw> + ((sext i32 %S to i64) smax (1 + (sext i32 %M to i64))<nsw>)) umin_seq (zext i32 ((-1 * %S) + %N) to i64))
+; CHECK-NEXT: symbolic max exit count for loop: ((-1 * (sext i32 %S to i64))<nsw> + ((sext i32 %S to i64) smax (1 + (sext i32 %M to i64))<nsw>))
; CHECK-NEXT: symbolic max exit count for latch: ((-1 * %S) + %N)
+; CHECK-NEXT: Loop %loop: Trip multiple is 1
;
entry:
br label %loop
@@ -209,6 +216,9 @@ define void @sle_from_int_min_no_nsw(i32 %M, i32 %N) {
; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-2147483648 + %N)
; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE***
; CHECK-NEXT: symbolic max exit count for latch: (-2147483648 + %N)
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((zext i32 (-2147483648 + %N) to i64) umin (2147483649 + (sext i32 %M to i64))<nsw>)
+; CHECK-NEXT: Predicates:
+; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: <nssw>
;
entry:
br label %loop
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir
new file mode 100644
index 000000000000..be33f9f7b284
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir
@@ -0,0 +1,252 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner %s -o - | FileCheck %s
+
+
+---
+name: ZeroMinusAPlusB
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: ZeroMinusAPlusB
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:_(s32) = COPY $w0
+ ; CHECK-NEXT: %b:_(s32) = COPY $w0
+ ; CHECK-NEXT: %add:_(s32) = G_SUB %b, %a
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %x:_(s32) = COPY $w0
+ %a:_(s32) = COPY $w0
+ %b:_(s32) = COPY $w0
+ %zero:_(s32) = G_CONSTANT i32 0
+ %sub:_(s32) = G_SUB %zero, %a
+ %add:_(s32) = G_ADD %sub, %b
+ $w0 = COPY %add
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: ZeroMinusAPlusB_multi_use
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: ZeroMinusAPlusB_multi_use
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:_(s32) = COPY $w0
+ ; CHECK-NEXT: %b:_(s32) = COPY $w0
+ ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: %sub:_(s32) = G_SUB %zero, %a
+ ; CHECK-NEXT: %add:_(s32) = G_SUB %b, %a
+ ; CHECK-NEXT: $w0 = COPY %add(s32)
+ ; CHECK-NEXT: $w0 = COPY %sub(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %x:_(s32) = COPY $w0
+ %a:_(s32) = COPY $w0
+ %b:_(s32) = COPY $w0
+ %zero:_(s32) = G_CONSTANT i32 0
+ %sub:_(s32) = G_SUB %zero, %a
+ %add:_(s32) = G_ADD %sub, %b
+ $w0 = COPY %add
+ $w0 = COPY %sub
+ RET_ReallyLR implicit $w0
+
+...
+---
+name: APlusZeroMinusB
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+    ; CHECK-LABEL: name: APlusZeroMinusB
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:_(s64) = COPY $x1
+ ; CHECK-NEXT: %b:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %a, %b
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %x:_(s64) = COPY $x0
+ %a:_(s64) = COPY $x1
+ %b:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub:_(s64) = G_SUB %zero, %b
+ %add:_(s64) = G_ADD %a, %sub
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: APlusBMinusB
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusBMinusB
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: $x0 = COPY %b(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub:_(s64) = G_SUB %b, %a
+ %add:_(s64) = G_ADD %a, %sub
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: BMinusAPlusA
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: BMinusAPlusA
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: $x0 = COPY %b(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub:_(s64) = G_SUB %b, %a
+ %add:_(s64) = G_ADD %sub, %a
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: AMinusBPlusCMinusA
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: AMinusBPlusCMinusA
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: %c:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %c, %b
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %c:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub2:_(s64) = G_SUB %c, %a
+ %sub1:_(s64) = G_SUB %a, %b
+ %add:_(s64) = G_ADD %sub1, %sub2
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: AMinusBPlusBMinusC
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: AMinusBPlusBMinusC
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a:_(s64) = COPY $x0
+ ; CHECK-NEXT: %c:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %a, %c
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %c:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %sub2:_(s64) = G_SUB %b, %c
+ %sub1:_(s64) = G_SUB %a, %b
+ %add:_(s64) = G_ADD %sub1, %sub2
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+
+...
+---
+name: APlusBMinusAplusC
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusBMinusAplusC
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: %c:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %b, %c
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %c:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %add1:_(s64) = G_ADD %a, %c
+ %sub1:_(s64) = G_SUB %b, %add1
+ %add:_(s64) = G_ADD %a, %sub1
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: APlusBMinusCPlusA
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusBMinusCPlusA
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %b:_(s64) = COPY $x1
+ ; CHECK-NEXT: %c:_(s64) = COPY $x2
+ ; CHECK-NEXT: %add:_(s64) = G_SUB %b, %c
+ ; CHECK-NEXT: $x0 = COPY %add(s64)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a:_(s64) = COPY $x0
+ %b:_(s64) = COPY $x1
+ %c:_(s64) = COPY $x2
+ %zero:_(s64) = G_CONSTANT i64 0
+ %add1:_(s64) = G_ADD %c, %a
+ %sub1:_(s64) = G_SUB %b, %add1
+ %add:_(s64) = G_ADD %a, %sub1
+ $x0 = COPY %add
+ RET_ReallyLR implicit $x0
+
+...
+---
+name: APlusBMinusCPlusA_BV
+body: |
+ bb.0:
+ liveins: $w0, $w1
+
+ ; CHECK-LABEL: name: APlusBMinusCPlusA_BV
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %a1:_(s64) = COPY $x0
+ ; CHECK-NEXT: %b1:_(s64) = COPY $x1
+ ; CHECK-NEXT: %c1:_(s64) = COPY $x2
+ ; CHECK-NEXT: %b:_(<2 x s64>) = G_BUILD_VECTOR %b1(s64), %ba:_(s64)
+ ; CHECK-NEXT: %c:_(<2 x s64>) = G_BUILD_VECTOR %a1(s64), %c1(s64)
+ ; CHECK-NEXT: %add:_(<2 x s64>) = G_SUB %b, %c
+ ; CHECK-NEXT: $q0 = COPY %add(<2 x s64>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $x0
+ %a1:_(s64) = COPY $x0
+ %b1:_(s64) = COPY $x1
+ %c1:_(s64) = COPY $x2
+ %a:_(<2 x s64>) = G_BUILD_VECTOR %a1:_(s64), %b1:_(s64)
+ %b:_(<2 x s64>) = G_BUILD_VECTOR %b1:_(s64), %ba:_(s64)
+ %c:_(<2 x s64>) = G_BUILD_VECTOR %a1:_(s64), %c1:_(s64)
+ %zero:_(s64) = G_CONSTANT i64 0
+ %add1:_(<2 x s64>) = G_ADD %c, %a
+ %sub1:_(<2 x s64>) = G_SUB %b, %add1
+ %add:_(<2 x s64>) = G_ADD %a, %sub1
+ $q0 = COPY %add
+ RET_ReallyLR implicit $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
index 353c1550d697..074d4ecbd878 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-select.mir
@@ -117,9 +117,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f
- ; CHECK-NEXT: %sel:_(s1) = G_OR %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64)
+ ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -144,9 +144,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f
- ; CHECK-NEXT: %sel:_(s1) = G_OR %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64)
+ ; CHECK-NEXT: %sel:_(s1) = G_OR %c, %f
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -172,9 +172,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d2
; CHECK-NEXT: %c:_(<2 x s1>) = G_TRUNC [[COPY]](<2 x s32>)
- ; CHECK-NEXT: %f:_(<2 x s1>) = G_TRUNC [[COPY1]](<2 x s32>)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<2 x s1>) = G_FREEZE %f
- ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(<2 x s32>) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %f:_(<2 x s1>) = G_TRUNC [[FREEZE]](<2 x s32>)
+ ; CHECK-NEXT: %sel:_(<2 x s1>) = G_OR %c, %f
; CHECK-NEXT: %ext:_(<2 x s32>) = G_ANYEXT %sel(<2 x s1>)
; CHECK-NEXT: $d0 = COPY %ext(<2 x s32>)
%0:_(<2 x s32>) = COPY $d0
@@ -201,9 +201,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t
- ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64)
+ ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -229,9 +229,9 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t
- ; CHECK-NEXT: %sel:_(s1) = G_AND %c, [[FREEZE]]
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64)
+ ; CHECK-NEXT: %sel:_(s1) = G_AND %c, %t
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -257,11 +257,11 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %t:_(s1) = G_TRUNC [[FREEZE]](s64)
; CHECK-NEXT: %one:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, %one
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %t
- ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], [[FREEZE]]
+ ; CHECK-NEXT: %sel:_(s1) = G_OR [[XOR]], %t
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
@@ -287,11 +287,11 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
; CHECK-NEXT: %c:_(s1) = G_TRUNC [[COPY]](s64)
- ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s64) = G_FREEZE [[COPY1]]
+ ; CHECK-NEXT: %f:_(s1) = G_TRUNC [[FREEZE]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR %c, [[C]]
- ; CHECK-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE %f
- ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], [[FREEZE]]
+ ; CHECK-NEXT: %sel:_(s1) = G_AND [[XOR]], %f
; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %sel(s1)
; CHECK-NEXT: $w0 = COPY %ext(s32)
%0:_(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
index e754f01daa2a..a8be8bbd193a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -1379,7 +1379,7 @@ define <8 x i8> @sextmask2v8i8(<8 x i16> %src1, <8 x i8> %src2) {
define <8 x i8> @sextmask3v8i8(<8 x i16> %src1, <8 x i8> %src2) {
; CHECK-LABEL: sextmask3v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sshr.8h v0, v0, #7
+; CHECK-NEXT: ushr.8h v0, v0, #7
; CHECK-NEXT: sshll.8h v1, v1, #0
; CHECK-NEXT: shadd.8h v0, v0, v1
; CHECK-NEXT: xtn.8b v0, v0
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index 30b5e86c1e6d..14a594e8028d 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -193,11 +193,10 @@ define void @test_64bit_badmask(ptr %existing, ptr %new) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x0]
; CHECK-NEXT: ldr x9, [x1]
-; CHECK-NEXT: mov w10, #135 // =0x87
-; CHECK-NEXT: mov w11, #664 // =0x298
-; CHECK-NEXT: lsl w9, w9, #3
-; CHECK-NEXT: and x8, x8, x10
-; CHECK-NEXT: and x9, x9, x11
+; CHECK-NEXT: mov w10, #664 // =0x298
+; CHECK-NEXT: mov w11, #135 // =0x87
+; CHECK-NEXT: and x9, x10, x9, lsl #3
+; CHECK-NEXT: and x8, x8, x11
; CHECK-NEXT: orr x8, x8, x9
; CHECK-NEXT: str x8, [x0]
; CHECK-NEXT: ret
@@ -579,7 +578,6 @@ define <2 x i32> @test_complex_type(ptr %addr, i64 %in, ptr %bf ) {
define i64 @test_truncated_shift(i64 %x, i64 %y) {
; CHECK-LABEL: test_truncated_shift:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $w1 killed $w1 killed $x1 def $x1
; CHECK-NEXT: bfi x0, x1, #25, #5
; CHECK-NEXT: ret
entry:
@@ -593,7 +591,6 @@ entry:
define i64 @test_and_extended_shift_with_imm(i64 %0) {
; CHECK-LABEL: test_and_extended_shift_with_imm:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 def $x0
; CHECK-NEXT: ubfiz x0, x0, #7, #8
; CHECK-NEXT: ret
%2 = shl i64 %0, 7
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
index c0f76784eb37..28f454767c12 100644
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -955,6 +955,71 @@ define <8 x i16> @urhadd_demandedelts(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %r0
}
+; Remove unnecessary sign_extend_inreg after shadd
+define <2 x i32> @shadd_signbits_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
+; CHECK-LABEL: shadd_signbits_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-NEXT: sshr v1.2s, v1.2s, #17
+; CHECK-NEXT: shadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
+ %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
+ %m = and <2 x i32> %x0, %x1
+ %s = xor <2 x i32> %x0, %x1
+ %x = ashr <2 x i32> %s, <i32 1, i32 1>
+ %avg = add <2 x i32> %m, %x
+ %avg1 = shl <2 x i32> %avg, <i32 17, i32 17>
+ %avg2 = ashr <2 x i32> %avg1, <i32 17, i32 17>
+ store <2 x i32> %avg, ptr %p2 ; extra use
+ ret <2 x i32> %avg2
+}
+
+; Remove unnecessary sign_extend_inreg after srhadd
+define <2 x i32> @srhadd_signbits_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
+; CHECK-LABEL: srhadd_signbits_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-NEXT: sshr v1.2s, v1.2s, #17
+; CHECK-NEXT: srhadd v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
+ %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
+ %m = or <2 x i32> %x0, %x1
+ %s = xor <2 x i32> %x0, %x1
+ %x = ashr <2 x i32> %s, <i32 1, i32 1>
+ %avg = sub <2 x i32> %m, %x
+ %avg1 = shl <2 x i32> %avg, <i32 17, i32 17>
+ %avg2 = ashr <2 x i32> %avg1, <i32 17, i32 17>
+ store <2 x i32> %avg, ptr %p2 ; extra use
+ ret <2 x i32> %avg2
+}
+
+; negative test - not enough signbits to remove sign_extend_inreg after srhadd
+define <2 x i32> @srhadd_signbits_v2i32_negative(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) {
+; CHECK-LABEL: srhadd_signbits_v2i32_negative:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sshr v0.2s, v0.2s, #17
+; CHECK-NEXT: sshr v1.2s, v1.2s, #17
+; CHECK-NEXT: srhadd v1.2s, v0.2s, v1.2s
+; CHECK-NEXT: shl v0.2s, v1.2s, #22
+; CHECK-NEXT: str d1, [x0]
+; CHECK-NEXT: sshr v0.2s, v0.2s, #22
+; CHECK-NEXT: ret
+ %x0 = ashr <2 x i32> %a0, <i32 17, i32 17>
+ %x1 = ashr <2 x i32> %a1, <i32 17, i32 17>
+ %m = or <2 x i32> %x0, %x1
+ %s = xor <2 x i32> %x0, %x1
+ %x = ashr <2 x i32> %s, <i32 1, i32 1>
+ %avg = sub <2 x i32> %m, %x
+ %avg1 = shl <2 x i32> %avg, <i32 22, i32 22>
+ %avg2 = ashr <2 x i32> %avg1, <i32 22, i32 22>
+ store <2 x i32> %avg, ptr %p2 ; extra use
+ ret <2 x i32> %avg2
+}
+
declare <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>)
declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>)
declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>)
@@ -979,4 +1044,4 @@ declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>)
declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>)
declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>)
declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>)
-declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) \ No newline at end of file
+declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 211237542a15..9c72afd84fa7 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -359,6 +359,152 @@ define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
ret i32 %add
}
+; FIXED-WIDTH VECTOR TYPES
+
+define i32 @ctz_v16i1(<16 x i1> %a) {
+; CHECK-LABEL: ctz_v16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v16i1_poison(<16 x i1> %a) {
+; CHECK-LABEL: ctz_v16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) {
+; CHECK-LABEL: add_i64_ctz_v16i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: ptrue p0.b, vl16
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: incp x0, p0.b
+; CHECK-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %a, i1 1)
+ %add = add i64 %res, %b
+ ret i64 %add
+}
+
+define i32 @ctz_v8i1(<8 x i1> %a) {
+; CHECK-LABEL: ctz_v8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v8i1_poison(<8 x i1> %a) {
+; CHECK-LABEL: ctz_v8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: ptrue p0.b, vl8
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i32 @ctz_v4i1(<4 x i1> %a) {
+; CHECK-LABEL: ctz_v4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v4i1_poison(<4 x i1> %a) {
+; CHECK-LABEL: ctz_v4i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.h
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 1)
+ ret i32 %res
+}
+
+define i32 @ctz_v2i1(<2 x i1> %a) {
+; CHECK-LABEL: ctz_v2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 0)
+ ret i32 %res
+}
+
+define i32 @ctz_v2i1_poison(<2 x i1> %a) {
+; CHECK-LABEL: ctz_v2i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #31
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.s
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+ %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
+ ret i32 %res
+}
+
declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1>, i1)
declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1>, i1)
declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1>, i1)
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 736f66c935e7..40b8a47f92aa 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1709,289 +1709,289 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: fmov s4, w0
; CHECK-NEXT: ldr b0, [sp, #80]
; CHECK-NEXT: add x8, sp, #88
-; CHECK-NEXT: ldr b2, [sp, #144]
-; CHECK-NEXT: fmov s4, w0
+; CHECK-NEXT: ldr b1, [sp, #144]
; CHECK-NEXT: add x10, sp, #152
-; CHECK-NEXT: ldr b3, [sp, #16]
+; CHECK-NEXT: ldr b6, [sp, #16]
; CHECK-NEXT: ld1 { v0.b }[1], [x8]
-; CHECK-NEXT: ld1 { v2.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #24
-; CHECK-NEXT: ldr b1, [sp, #344]
; CHECK-NEXT: add x9, sp, #96
-; CHECK-NEXT: ld1 { v3.b }[1], [x10]
-; CHECK-NEXT: add x10, sp, #352
+; CHECK-NEXT: ldr b2, [sp, #344]
; CHECK-NEXT: mov v4.b[1], w1
+; CHECK-NEXT: ld1 { v1.b }[1], [x10]
+; CHECK-NEXT: add x10, sp, #24
+; CHECK-NEXT: ld1 { v6.b }[1], [x10]
+; CHECK-NEXT: add x10, sp, #352
; CHECK-NEXT: add x8, sp, #104
; CHECK-NEXT: ld1 { v0.b }[2], [x9]
; CHECK-NEXT: add x9, sp, #160
-; CHECK-NEXT: ld1 { v1.b }[1], [x10]
-; CHECK-NEXT: ld1 { v2.b }[2], [x9]
-; CHECK-NEXT: add x9, sp, #32
-; CHECK-NEXT: add x12, sp, #360
-; CHECK-NEXT: ld1 { v3.b }[2], [x9]
+; CHECK-NEXT: ld1 { v2.b }[1], [x10]
+; CHECK-NEXT: ld1 { v1.b }[2], [x9]
+; CHECK-NEXT: add x10, sp, #32
; CHECK-NEXT: add x11, sp, #112
-; CHECK-NEXT: add x10, sp, #120
-; CHECK-NEXT: ld1 { v1.b }[2], [x12]
-; CHECK-NEXT: add x12, sp, #168
-; CHECK-NEXT: ld1 { v0.b }[3], [x8]
; CHECK-NEXT: mov v4.b[2], w2
-; CHECK-NEXT: ld1 { v2.b }[3], [x12]
-; CHECK-NEXT: add x12, sp, #40
-; CHECK-NEXT: ld1 { v3.b }[3], [x12]
-; CHECK-NEXT: add x13, sp, #176
-; CHECK-NEXT: ldr b16, [sp, #216]
-; CHECK-NEXT: ld1 { v0.b }[4], [x11]
-; CHECK-NEXT: add x11, sp, #48
-; CHECK-NEXT: add x12, sp, #368
-; CHECK-NEXT: ld1 { v2.b }[4], [x13]
+; CHECK-NEXT: ld1 { v6.b }[2], [x10]
+; CHECK-NEXT: add x10, sp, #168
+; CHECK-NEXT: ld1 { v0.b }[3], [x8]
+; CHECK-NEXT: ldr b5, [sp, #216]
; CHECK-NEXT: add x13, sp, #224
-; CHECK-NEXT: add x9, sp, #128
+; CHECK-NEXT: ld1 { v1.b }[3], [x10]
+; CHECK-NEXT: add x10, sp, #40
+; CHECK-NEXT: add x12, sp, #120
+; CHECK-NEXT: ld1 { v6.b }[3], [x10]
+; CHECK-NEXT: add x10, sp, #176
+; CHECK-NEXT: ld1 { v5.b }[1], [x13]
; CHECK-NEXT: mov v4.b[3], w3
-; CHECK-NEXT: ld1 { v3.b }[4], [x11]
-; CHECK-NEXT: ld1 { v16.b }[1], [x13]
-; CHECK-NEXT: ld1 { v0.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #56
-; CHECK-NEXT: ld1 { v1.b }[3], [x12]
-; CHECK-NEXT: add x12, sp, #184
-; CHECK-NEXT: ldr b5, [sp, #280]
-; CHECK-NEXT: add x11, sp, #376
-; CHECK-NEXT: ld1 { v3.b }[5], [x10]
-; CHECK-NEXT: ld1 { v2.b }[5], [x12]
-; CHECK-NEXT: add x10, sp, #232
+; CHECK-NEXT: ld1 { v0.b }[4], [x11]
+; CHECK-NEXT: add x11, sp, #48
+; CHECK-NEXT: add x8, sp, #360
+; CHECK-NEXT: ld1 { v1.b }[4], [x10]
+; CHECK-NEXT: add x13, sp, #56
+; CHECK-NEXT: ld1 { v6.b }[4], [x11]
+; CHECK-NEXT: ldr b7, [sp, #280]
+; CHECK-NEXT: ld1 { v2.b }[2], [x8]
+; CHECK-NEXT: add x15, sp, #232
+; CHECK-NEXT: ld1 { v0.b }[5], [x12]
+; CHECK-NEXT: add x14, sp, #184
; CHECK-NEXT: mov v4.b[4], w4
+; CHECK-NEXT: ld1 { v5.b }[2], [x15]
+; CHECK-NEXT: add x9, sp, #128
+; CHECK-NEXT: ld1 { v6.b }[5], [x13]
+; CHECK-NEXT: add x13, sp, #288
+; CHECK-NEXT: add x10, sp, #368
+; CHECK-NEXT: ld1 { v7.b }[1], [x13]
+; CHECK-NEXT: ld1 { v1.b }[5], [x14]
+; CHECK-NEXT: ld1 { v2.b }[3], [x10]
+; CHECK-NEXT: add x15, sp, #240
; CHECK-NEXT: ld1 { v0.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #288
-; CHECK-NEXT: add x15, sp, #64
-; CHECK-NEXT: ld1 { v16.b }[2], [x10]
-; CHECK-NEXT: ldr b17, [sp, #408]
-; CHECK-NEXT: ld1 { v5.b }[1], [x9]
-; CHECK-NEXT: add x14, sp, #192
-; CHECK-NEXT: ld1 { v1.b }[4], [x11]
-; CHECK-NEXT: ld1 { v3.b }[6], [x15]
-; CHECK-NEXT: add x15, sp, #416
-; CHECK-NEXT: ld1 { v2.b }[6], [x14]
-; CHECK-NEXT: add x14, sp, #240
-; CHECK-NEXT: ld1 { v17.b }[1], [x15]
; CHECK-NEXT: add x9, sp, #296
-; CHECK-NEXT: add x8, sp, #136
; CHECK-NEXT: mov v4.b[5], w5
-; CHECK-NEXT: add x13, sp, #384
-; CHECK-NEXT: ld1 { v16.b }[3], [x14]
-; CHECK-NEXT: ld1 { v5.b }[2], [x9]
-; CHECK-NEXT: ld1 { v1.b }[5], [x13]
-; CHECK-NEXT: ld1 { v0.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #424
-; CHECK-NEXT: add x9, sp, #248
-; CHECK-NEXT: ld1 { v17.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #304
-; CHECK-NEXT: add x10, sp, #392
-; CHECK-NEXT: ld1 { v16.b }[4], [x9]
-; CHECK-NEXT: ld1 { v5.b }[3], [x8]
+; CHECK-NEXT: add x11, sp, #192
+; CHECK-NEXT: ld1 { v5.b }[3], [x15]
+; CHECK-NEXT: ldr b3, [sp, #408]
+; CHECK-NEXT: ld1 { v7.b }[2], [x9]
+; CHECK-NEXT: add x12, sp, #64
+; CHECK-NEXT: add x13, sp, #376
+; CHECK-NEXT: ld1 { v1.b }[6], [x11]
+; CHECK-NEXT: add x11, sp, #416
+; CHECK-NEXT: ld1 { v6.b }[6], [x12]
+; CHECK-NEXT: add x12, sp, #248
+; CHECK-NEXT: ld1 { v3.b }[1], [x11]
; CHECK-NEXT: mov v4.b[6], w6
-; CHECK-NEXT: ld1 { v1.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #432
-; CHECK-NEXT: add x9, sp, #256
-; CHECK-NEXT: ld1 { v17.b }[3], [x10]
-; CHECK-NEXT: add x10, sp, #312
-; CHECK-NEXT: ldr b22, [sp, #608]
-; CHECK-NEXT: add x8, sp, #400
-; CHECK-NEXT: ld1 { v16.b }[5], [x9]
-; CHECK-NEXT: ld1 { v5.b }[4], [x10]
-; CHECK-NEXT: add x9, sp, #616
-; CHECK-NEXT: ld1 { v1.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #440
-; CHECK-NEXT: ld1 { v22.b }[1], [x9]
+; CHECK-NEXT: ld1 { v2.b }[4], [x13]
+; CHECK-NEXT: add x11, sp, #304
+; CHECK-NEXT: ld1 { v5.b }[4], [x12]
+; CHECK-NEXT: ld1 { v7.b }[3], [x11]
+; CHECK-NEXT: add x8, sp, #136
+; CHECK-NEXT: add x15, sp, #384
+; CHECK-NEXT: add x9, sp, #424
+; CHECK-NEXT: ld1 { v0.b }[7], [x8]
+; CHECK-NEXT: ld1 { v3.b }[2], [x9]
+; CHECK-NEXT: ld1 { v2.b }[5], [x15]
+; CHECK-NEXT: add x8, sp, #312
; CHECK-NEXT: mov v4.b[7], w7
-; CHECK-NEXT: ld1 { v17.b }[4], [x8]
+; CHECK-NEXT: add x9, sp, #256
+; CHECK-NEXT: add x10, sp, #200
+; CHECK-NEXT: ld1 { v7.b }[4], [x8]
+; CHECK-NEXT: ld1 { v5.b }[5], [x9]
+; CHECK-NEXT: add x14, sp, #72
+; CHECK-NEXT: ld1 { v1.b }[7], [x10]
+; CHECK-NEXT: add x10, sp, #432
+; CHECK-NEXT: add x8, sp, #392
+; CHECK-NEXT: ld1 { v6.b }[7], [x14]
+; CHECK-NEXT: ld1 { v3.b }[3], [x10]
+; CHECK-NEXT: ld1 { v2.b }[6], [x8]
; CHECK-NEXT: add x8, sp, #320
+; CHECK-NEXT: add x9, sp, #264
+; CHECK-NEXT: sshll v21.8h, v4.8b, #0
+; CHECK-NEXT: ldr b4, [sp, #208]
+; CHECK-NEXT: ld1 { v7.b }[5], [x8]
+; CHECK-NEXT: ld1 { v5.b }[6], [x9]
+; CHECK-NEXT: add x10, sp, #440
+; CHECK-NEXT: add x8, sp, #400
+; CHECK-NEXT: sshll v16.8h, v6.8b, #0
+; CHECK-NEXT: sshll v6.8h, v4.8b, #0
+; CHECK-NEXT: ld1 { v3.b }[4], [x10]
+; CHECK-NEXT: ld1 { v2.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #272
+; CHECK-NEXT: add x9, sp, #328
+; CHECK-NEXT: ldr b4, [sp, #608]
+; CHECK-NEXT: ld1 { v7.b }[6], [x9]
+; CHECK-NEXT: ld1 { v5.b }[7], [x8]
+; CHECK-NEXT: add x8, sp, #616
; CHECK-NEXT: add x10, sp, #448
-; CHECK-NEXT: ldr b6, [sp, #208]
-; CHECK-NEXT: ld1 { v5.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #624
-; CHECK-NEXT: ldr b7, [sp, #472]
-; CHECK-NEXT: ld1 { v22.b }[2], [x8]
-; CHECK-NEXT: ld1 { v17.b }[5], [x10]
-; CHECK-NEXT: add x10, sp, #328
-; CHECK-NEXT: sshll v20.8h, v4.8b, #0
-; CHECK-NEXT: ldr b4, [sp, #480]
+; CHECK-NEXT: ld1 { v4.b }[1], [x8]
+; CHECK-NEXT: ldr b18, [sp, #480]
+; CHECK-NEXT: ld1 { v3.b }[5], [x10]
+; CHECK-NEXT: add x9, sp, #336
+; CHECK-NEXT: ldr b17, [sp, #472]
+; CHECK-NEXT: add x8, sp, #488
+; CHECK-NEXT: ld1 { v7.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #624
+; CHECK-NEXT: ld1 { v18.b }[1], [x8]
+; CHECK-NEXT: sshll v22.8h, v5.8b, #0
; CHECK-NEXT: add x8, sp, #456
-; CHECK-NEXT: ld1 { v5.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #632
-; CHECK-NEXT: sshll v6.8h, v6.8b, #0
-; CHECK-NEXT: ld1 { v22.b }[3], [x10]
-; CHECK-NEXT: add x10, sp, #488
-; CHECK-NEXT: ld1 { v17.b }[6], [x8]
-; CHECK-NEXT: add x8, sp, #336
-; CHECK-NEXT: ld1 { v4.b }[1], [x10]
-; CHECK-NEXT: sshll v7.8h, v7.8b, #0
-; CHECK-NEXT: ld1 { v5.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #640
-; CHECK-NEXT: add x9, sp, #264
-; CHECK-NEXT: ld1 { v22.b }[4], [x8]
+; CHECK-NEXT: sshll v5.8h, v17.8b, #0
+; CHECK-NEXT: ld1 { v4.b }[2], [x9]
+; CHECK-NEXT: ld1 { v3.b }[6], [x8]
; CHECK-NEXT: add x8, sp, #496
-; CHECK-NEXT: ld1 { v16.b }[6], [x9]
-; CHECK-NEXT: ld1 { v4.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #648
-; CHECK-NEXT: smull v18.4s, v6.4h, v7.4h
-; CHECK-NEXT: ldr b7, [sp, #544]
-; CHECK-NEXT: add x9, sp, #272
-; CHECK-NEXT: movi v6.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v22.b }[5], [x8]
+; CHECK-NEXT: sshll v17.8h, v7.8b, #0
+; CHECK-NEXT: add x10, sp, #632
+; CHECK-NEXT: ld1 { v18.b }[2], [x8]
+; CHECK-NEXT: add x9, sp, #464
; CHECK-NEXT: add x8, sp, #504
-; CHECK-NEXT: ld1 { v16.b }[7], [x9]
-; CHECK-NEXT: ld1 { v4.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #552
-; CHECK-NEXT: add x9, sp, #656
-; CHECK-NEXT: ld1 { v7.b }[1], [x8]
+; CHECK-NEXT: smull v19.4s, v6.4h, v5.4h
+; CHECK-NEXT: movi v5.2d, #0000000000000000
+; CHECK-NEXT: ld1 { v4.b }[3], [x10]
+; CHECK-NEXT: ld1 { v3.b }[7], [x9]
+; CHECK-NEXT: smull v6.4s, v16.4h, v17.4h
+; CHECK-NEXT: add x9, sp, #640
+; CHECK-NEXT: ld1 { v18.b }[3], [x8]
+; CHECK-NEXT: smull2 v16.4s, v16.8h, v17.8h
+; CHECK-NEXT: ldr b17, [sp, #672]
+; CHECK-NEXT: ld1 { v4.b }[4], [x9]
+; CHECK-NEXT: add x9, sp, #680
+; CHECK-NEXT: ldr b20, [sp, #544]
+; CHECK-NEXT: mov v5.s[0], v19.s[0]
; CHECK-NEXT: add x8, sp, #512
-; CHECK-NEXT: ldr b21, [sp, #672]
-; CHECK-NEXT: ld1 { v22.b }[6], [x9]
-; CHECK-NEXT: mov v6.s[0], v18.s[0]
-; CHECK-NEXT: add x9, sp, #664
-; CHECK-NEXT: ld1 { v4.b }[4], [x8]
-; CHECK-NEXT: add x8, sp, #560
-; CHECK-NEXT: sshll v23.8h, v16.8b, #0
-; CHECK-NEXT: ld1 { v7.b }[2], [x8]
-; CHECK-NEXT: add x8, sp, #520
-; CHECK-NEXT: movi v19.2d, #0000000000000000
-; CHECK-NEXT: ld1 { v22.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #528
-; CHECK-NEXT: add x10, sp, #464
-; CHECK-NEXT: ld1 { v4.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #568
-; CHECK-NEXT: smull2 v18.4s, v20.8h, v23.8h
-; CHECK-NEXT: ld1 { v7.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #680
-; CHECK-NEXT: smlal v6.4s, v20.4h, v23.4h
-; CHECK-NEXT: ld1 { v21.b }[1], [x8]
-; CHECK-NEXT: sshll v20.8h, v22.8b, #0
-; CHECK-NEXT: ldr b22, [sp, #736]
-; CHECK-NEXT: ld1 { v4.b }[6], [x9]
-; CHECK-NEXT: add x9, sp, #576
-; CHECK-NEXT: ldr b23, [sp, #1000]
-; CHECK-NEXT: ld1 { v7.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #688
-; CHECK-NEXT: sshll v24.8h, v22.8b, #0
-; CHECK-NEXT: ld1 { v21.b }[2], [x9]
+; CHECK-NEXT: ld1 { v17.b }[1], [x9]
+; CHECK-NEXT: add x11, sp, #552
+; CHECK-NEXT: add x10, sp, #648
+; CHECK-NEXT: ld1 { v18.b }[4], [x8]
+; CHECK-NEXT: ld1 { v20.b }[1], [x11]
+; CHECK-NEXT: ld1 { v4.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #688
+; CHECK-NEXT: add x9, sp, #520
+; CHECK-NEXT: ld1 { v17.b }[2], [x10]
+; CHECK-NEXT: add x10, sp, #560
+; CHECK-NEXT: smull2 v7.4s, v21.8h, v22.8h
+; CHECK-NEXT: ld1 { v18.b }[5], [x9]
+; CHECK-NEXT: smlal v5.4s, v21.4h, v22.4h
+; CHECK-NEXT: ld1 { v20.b }[2], [x10]
+; CHECK-NEXT: ldr b21, [sp, #736]
+; CHECK-NEXT: ldr b22, [sp, #1000]
+; CHECK-NEXT: add x8, sp, #656
; CHECK-NEXT: add x9, sp, #696
-; CHECK-NEXT: sshll v25.8h, v23.8b, #0
-; CHECK-NEXT: add x8, sp, #536
-; CHECK-NEXT: ldr b22, [sp, #872]
-; CHECK-NEXT: ldr b23, [sp, #936]
-; CHECK-NEXT: ld1 { v4.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #584
-; CHECK-NEXT: ld1 { v17.b }[7], [x10]
-; CHECK-NEXT: ld1 { v21.b }[3], [x9]
-; CHECK-NEXT: ld1 { v7.b }[5], [x8]
-; CHECK-NEXT: add x8, sp, #880
-; CHECK-NEXT: add x9, sp, #704
-; CHECK-NEXT: smull v25.4s, v24.4h, v25.4h
-; CHECK-NEXT: ldr b24, [sp, #744]
-; CHECK-NEXT: ld1 { v22.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #944
-; CHECK-NEXT: add x10, sp, #888
-; CHECK-NEXT: ld1 { v21.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #752
-; CHECK-NEXT: ld1 { v23.b }[1], [x8]
-; CHECK-NEXT: ld1 { v24.b }[1], [x9]
-; CHECK-NEXT: add x8, sp, #712
+; CHECK-NEXT: add x11, sp, #568
+; CHECK-NEXT: ld1 { v4.b }[6], [x8]
+; CHECK-NEXT: add x8, sp, #528
+; CHECK-NEXT: ld1 { v17.b }[3], [x9]
+; CHECK-NEXT: sshll v21.8h, v21.8b, #0
+; CHECK-NEXT: sshll v24.8h, v22.8b, #0
+; CHECK-NEXT: ld1 { v18.b }[6], [x8]
+; CHECK-NEXT: ld1 { v20.b }[3], [x11]
+; CHECK-NEXT: add x10, sp, #704
+; CHECK-NEXT: ldr b23, [sp, #808]
+; CHECK-NEXT: movi v19.2d, #0000000000000000
+; CHECK-NEXT: add x9, sp, #536
+; CHECK-NEXT: ld1 { v17.b }[4], [x10]
+; CHECK-NEXT: add x10, sp, #576
+; CHECK-NEXT: ldr b22, [sp, #744]
+; CHECK-NEXT: add x11, sp, #816
+; CHECK-NEXT: smull v24.4s, v21.4h, v24.4h
+; CHECK-NEXT: ld1 { v18.b }[7], [x9]
+; CHECK-NEXT: ld1 { v20.b }[4], [x10]
+; CHECK-NEXT: add x10, sp, #752
+; CHECK-NEXT: ld1 { v23.b }[1], [x11]
+; CHECK-NEXT: add x9, sp, #712
+; CHECK-NEXT: ld1 { v22.b }[1], [x10]
+; CHECK-NEXT: ld1 { v17.b }[5], [x9]
+; CHECK-NEXT: add x9, sp, #584
+; CHECK-NEXT: add x10, sp, #824
+; CHECK-NEXT: sshll v21.8h, v18.8b, #0
+; CHECK-NEXT: ld1 { v20.b }[5], [x9]
; CHECK-NEXT: add x9, sp, #760
-; CHECK-NEXT: ld1 { v22.b }[2], [x10]
-; CHECK-NEXT: add x10, sp, #952
-; CHECK-NEXT: mov v19.s[0], v25.s[0]
-; CHECK-NEXT: ldr b25, [sp, #808]
+; CHECK-NEXT: ldr b18, [sp, #936]
; CHECK-NEXT: ld1 { v23.b }[2], [x10]
-; CHECK-NEXT: ld1 { v21.b }[5], [x8]
-; CHECK-NEXT: ld1 { v24.b }[2], [x9]
-; CHECK-NEXT: add x8, sp, #816
-; CHECK-NEXT: add x9, sp, #896
-; CHECK-NEXT: ld1 { v25.b }[1], [x8]
-; CHECK-NEXT: add x8, sp, #960
-; CHECK-NEXT: ld1 { v22.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #768
-; CHECK-NEXT: ld1 { v23.b }[3], [x8]
-; CHECK-NEXT: add x10, sp, #904
-; CHECK-NEXT: ld1 { v24.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #824
-; CHECK-NEXT: add x8, sp, #720
-; CHECK-NEXT: ld1 { v25.b }[2], [x9]
-; CHECK-NEXT: add x9, sp, #968
-; CHECK-NEXT: ld1 { v22.b }[4], [x10]
-; CHECK-NEXT: add x10, sp, #776
-; CHECK-NEXT: ld1 { v23.b }[4], [x9]
-; CHECK-NEXT: ld1 { v21.b }[6], [x8]
-; CHECK-NEXT: ld1 { v24.b }[4], [x10]
-; CHECK-NEXT: add x8, sp, #832
-; CHECK-NEXT: add x9, sp, #912
-; CHECK-NEXT: ld1 { v25.b }[3], [x8]
-; CHECK-NEXT: add x8, sp, #976
-; CHECK-NEXT: ld1 { v22.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #784
-; CHECK-NEXT: ld1 { v23.b }[5], [x8]
-; CHECK-NEXT: add x10, sp, #920
-; CHECK-NEXT: ld1 { v24.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #840
-; CHECK-NEXT: add x8, sp, #728
-; CHECK-NEXT: ld1 { v25.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #984
-; CHECK-NEXT: ld1 { v22.b }[6], [x10]
-; CHECK-NEXT: add x10, sp, #792
-; CHECK-NEXT: ld1 { v23.b }[6], [x9]
-; CHECK-NEXT: ld1 { v21.b }[7], [x8]
-; CHECK-NEXT: ld1 { v24.b }[6], [x10]
-; CHECK-NEXT: add x8, sp, #848
-; CHECK-NEXT: add x9, sp, #928
-; CHECK-NEXT: ld1 { v25.b }[5], [x8]
-; CHECK-NEXT: add x12, sp, #72
-; CHECK-NEXT: add x8, sp, #992
-; CHECK-NEXT: ld1 { v22.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #800
-; CHECK-NEXT: ld1 { v3.b }[7], [x12]
-; CHECK-NEXT: ld1 { v23.b }[7], [x8]
-; CHECK-NEXT: add x8, sp, #592
-; CHECK-NEXT: ld1 { v24.b }[7], [x9]
-; CHECK-NEXT: add x9, sp, #856
-; CHECK-NEXT: ld1 { v7.b }[6], [x8]
-; CHECK-NEXT: add x11, sp, #200
-; CHECK-NEXT: ld1 { v25.b }[6], [x9]
-; CHECK-NEXT: sshll v3.8h, v3.8b, #0
-; CHECK-NEXT: sshll v5.8h, v5.8b, #0
-; CHECK-NEXT: sshll v4.8h, v4.8b, #0
-; CHECK-NEXT: sshll v21.8h, v21.8b, #0
+; CHECK-NEXT: mov v19.s[0], v24.s[0]
+; CHECK-NEXT: ldr b24, [sp, #872]
+; CHECK-NEXT: ld1 { v22.b }[2], [x9]
+; CHECK-NEXT: add x9, sp, #944
+; CHECK-NEXT: add x11, sp, #880
+; CHECK-NEXT: add x10, sp, #768
+; CHECK-NEXT: ld1 { v18.b }[1], [x9]
+; CHECK-NEXT: add x9, sp, #832
+; CHECK-NEXT: ld1 { v24.b }[1], [x11]
+; CHECK-NEXT: ld1 { v23.b }[3], [x9]
+; CHECK-NEXT: ld1 { v22.b }[3], [x10]
+; CHECK-NEXT: add x10, sp, #952
+; CHECK-NEXT: add x12, sp, #888
+; CHECK-NEXT: add x9, sp, #592
+; CHECK-NEXT: add x11, sp, #776
+; CHECK-NEXT: ld1 { v18.b }[2], [x10]
+; CHECK-NEXT: add x10, sp, #840
+; CHECK-NEXT: ld1 { v24.b }[2], [x12]
+; CHECK-NEXT: ld1 { v23.b }[4], [x10]
+; CHECK-NEXT: ld1 { v22.b }[4], [x11]
+; CHECK-NEXT: ld1 { v20.b }[6], [x9]
+; CHECK-NEXT: add x9, sp, #960
+; CHECK-NEXT: add x11, sp, #896
+; CHECK-NEXT: add x10, sp, #784
+; CHECK-NEXT: ld1 { v18.b }[3], [x9]
+; CHECK-NEXT: add x9, sp, #848
+; CHECK-NEXT: ld1 { v24.b }[3], [x11]
+; CHECK-NEXT: ld1 { v23.b }[5], [x9]
+; CHECK-NEXT: ld1 { v22.b }[5], [x10]
+; CHECK-NEXT: add x10, sp, #968
+; CHECK-NEXT: add x12, sp, #904
+; CHECK-NEXT: add x9, sp, #600
+; CHECK-NEXT: add x11, sp, #792
+; CHECK-NEXT: ld1 { v18.b }[4], [x10]
+; CHECK-NEXT: add x10, sp, #856
+; CHECK-NEXT: ld1 { v24.b }[4], [x12]
+; CHECK-NEXT: ld1 { v23.b }[6], [x10]
+; CHECK-NEXT: ld1 { v22.b }[6], [x11]
+; CHECK-NEXT: ld1 { v20.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #976
+; CHECK-NEXT: add x11, sp, #912
+; CHECK-NEXT: add x10, sp, #800
+; CHECK-NEXT: ld1 { v18.b }[5], [x9]
+; CHECK-NEXT: add x9, sp, #864
+; CHECK-NEXT: ld1 { v24.b }[5], [x11]
+; CHECK-NEXT: ld1 { v23.b }[7], [x9]
+; CHECK-NEXT: add x9, sp, #720
+; CHECK-NEXT: ld1 { v22.b }[7], [x10]
+; CHECK-NEXT: add x10, sp, #984
+; CHECK-NEXT: ld1 { v17.b }[6], [x9]
+; CHECK-NEXT: add x9, sp, #920
+; CHECK-NEXT: ld1 { v18.b }[6], [x10]
+; CHECK-NEXT: ld1 { v24.b }[6], [x9]
+; CHECK-NEXT: add x10, sp, #728
+; CHECK-NEXT: add x8, sp, #664
+; CHECK-NEXT: sshll v20.8h, v20.8b, #0
; CHECK-NEXT: sshll v22.8h, v22.8b, #0
; CHECK-NEXT: sshll v23.8h, v23.8b, #0
-; CHECK-NEXT: add x8, sp, #600
-; CHECK-NEXT: sshll v24.8h, v24.8b, #0
-; CHECK-NEXT: add x9, sp, #864
-; CHECK-NEXT: ld1 { v2.b }[7], [x11]
-; CHECK-NEXT: ld1 { v7.b }[7], [x8]
-; CHECK-NEXT: ld1 { v25.b }[7], [x9]
-; CHECK-NEXT: smull v16.4s, v3.4h, v5.4h
-; CHECK-NEXT: smull2 v3.4s, v3.8h, v5.8h
-; CHECK-NEXT: smull v5.4s, v21.4h, v23.4h
-; CHECK-NEXT: smull2 v21.4s, v21.8h, v23.8h
-; CHECK-NEXT: smull2 v23.4s, v20.8h, v22.8h
-; CHECK-NEXT: smlal v19.4s, v4.4h, v24.4h
-; CHECK-NEXT: sshll v2.8h, v2.8b, #0
-; CHECK-NEXT: sshll v17.8h, v17.8b, #0
+; CHECK-NEXT: add x9, sp, #992
+; CHECK-NEXT: ld1 { v17.b }[7], [x10]
+; CHECK-NEXT: add x10, sp, #928
+; CHECK-NEXT: ld1 { v18.b }[7], [x9]
+; CHECK-NEXT: ld1 { v4.b }[7], [x8]
+; CHECK-NEXT: ld1 { v24.b }[7], [x10]
+; CHECK-NEXT: smlal v19.4s, v21.4h, v22.4h
+; CHECK-NEXT: smull2 v21.4s, v21.8h, v22.8h
+; CHECK-NEXT: smull v22.4s, v20.4h, v23.4h
+; CHECK-NEXT: smull2 v20.4s, v20.8h, v23.8h
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NEXT: sshll v7.8h, v7.8b, #0
-; CHECK-NEXT: sshll v25.8h, v25.8b, #0
-; CHECK-NEXT: smlal2 v3.4s, v2.8h, v17.8h
-; CHECK-NEXT: smlal v16.4s, v2.4h, v17.4h
-; CHECK-NEXT: smlal2 v23.4s, v4.8h, v24.8h
-; CHECK-NEXT: smlal2 v18.4s, v0.8h, v1.8h
-; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h
-; CHECK-NEXT: smlal v19.4s, v20.4h, v22.4h
-; CHECK-NEXT: smlal2 v21.4s, v7.8h, v25.8h
-; CHECK-NEXT: smlal v5.4s, v7.4h, v25.4h
-; CHECK-NEXT: add v0.4s, v18.4s, v3.4s
-; CHECK-NEXT: add v1.4s, v6.4s, v16.4s
-; CHECK-NEXT: add v2.4s, v23.4s, v21.4s
-; CHECK-NEXT: add v3.4s, v19.4s, v5.4s
+; CHECK-NEXT: sshll v3.8h, v3.8b, #0
+; CHECK-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-NEXT: sshll v17.8h, v17.8b, #0
+; CHECK-NEXT: sshll v18.8h, v18.8b, #0
+; CHECK-NEXT: sshll v4.8h, v4.8b, #0
+; CHECK-NEXT: sshll v23.8h, v24.8b, #0
+; CHECK-NEXT: smlal2 v16.4s, v1.8h, v3.8h
+; CHECK-NEXT: smlal v6.4s, v1.4h, v3.4h
+; CHECK-NEXT: smlal2 v7.4s, v0.8h, v2.8h
+; CHECK-NEXT: smlal v5.4s, v0.4h, v2.4h
+; CHECK-NEXT: smlal2 v20.4s, v17.8h, v18.8h
+; CHECK-NEXT: smlal v22.4s, v17.4h, v18.4h
+; CHECK-NEXT: smlal2 v21.4s, v4.8h, v23.8h
+; CHECK-NEXT: smlal v19.4s, v4.4h, v23.4h
+; CHECK-NEXT: add v0.4s, v7.4s, v16.4s
+; CHECK-NEXT: add v1.4s, v5.4s, v6.4s
+; CHECK-NEXT: add v2.4s, v21.4s, v20.4s
+; CHECK-NEXT: add v3.4s, v19.4s, v22.4s
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: add v1.4s, v3.4s, v2.4s
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
@@ -2050,10 +2050,10 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: ld1 { v3.b }[2], [x10]
; CHECK-NEXT: ld1 { v5.b }[2], [x8]
; CHECK-NEXT: add x8, sp, #176
-; CHECK-NEXT: ldr b6, [sp, #544]
+; CHECK-NEXT: ldr b6, [sp, #672]
; CHECK-NEXT: ld1 { v0.b }[4], [x12]
-; CHECK-NEXT: add x14, sp, #552
-; CHECK-NEXT: ldr b7, [sp, #672]
+; CHECK-NEXT: add x14, sp, #680
+; CHECK-NEXT: ldr b7, [sp, #544]
; CHECK-NEXT: ld1 { v2.b }[4], [x8]
; CHECK-NEXT: add x13, sp, #40
; CHECK-NEXT: ld1 { v6.b }[1], [x14]
@@ -2061,7 +2061,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: add x11, sp, #128
; CHECK-NEXT: ld1 { v3.b }[3], [x13]
; CHECK-NEXT: ld1 { v0.b }[5], [x9]
-; CHECK-NEXT: add x9, sp, #680
+; CHECK-NEXT: add x9, sp, #552
; CHECK-NEXT: add x13, sp, #184
; CHECK-NEXT: ld1 { v7.b }[1], [x9]
; CHECK-NEXT: ld1 { v2.b }[5], [x13]
@@ -2070,26 +2070,26 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: ld1 { v4.b }[2], [x13]
; CHECK-NEXT: add x10, sp, #136
; CHECK-NEXT: ld1 { v0.b }[6], [x11]
-; CHECK-NEXT: add x11, sp, #560
+; CHECK-NEXT: add x11, sp, #688
; CHECK-NEXT: ld1 { v5.b }[3], [x15]
; CHECK-NEXT: ld1 { v6.b }[2], [x11]
-; CHECK-NEXT: add x11, sp, #688
+; CHECK-NEXT: add x11, sp, #560
; CHECK-NEXT: mov v1.b[3], w3
; CHECK-NEXT: ld1 { v7.b }[2], [x11]
; CHECK-NEXT: add x9, sp, #632
; CHECK-NEXT: add x11, sp, #512
; CHECK-NEXT: ld1 { v0.b }[7], [x10]
; CHECK-NEXT: ld1 { v4.b }[3], [x9]
-; CHECK-NEXT: add x9, sp, #568
-; CHECK-NEXT: add x10, sp, #696
+; CHECK-NEXT: add x9, sp, #696
+; CHECK-NEXT: add x10, sp, #568
; CHECK-NEXT: ld1 { v6.b }[3], [x9]
; CHECK-NEXT: ld1 { v5.b }[4], [x11]
; CHECK-NEXT: ld1 { v7.b }[3], [x10]
; CHECK-NEXT: add x9, sp, #640
; CHECK-NEXT: mov v1.b[4], w4
; CHECK-NEXT: ld1 { v4.b }[4], [x9]
-; CHECK-NEXT: add x9, sp, #576
-; CHECK-NEXT: add x10, sp, #704
+; CHECK-NEXT: add x9, sp, #704
+; CHECK-NEXT: add x10, sp, #576
; CHECK-NEXT: add x11, sp, #520
; CHECK-NEXT: ld1 { v6.b }[4], [x9]
; CHECK-NEXT: ldr b18, [sp, #736]
@@ -2101,8 +2101,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: add x9, sp, #648
; CHECK-NEXT: ld1 { v3.b }[4], [x8]
; CHECK-NEXT: add x10, sp, #528
-; CHECK-NEXT: add x11, sp, #584
-; CHECK-NEXT: add x12, sp, #712
+; CHECK-NEXT: add x11, sp, #712
+; CHECK-NEXT: add x12, sp, #584
; CHECK-NEXT: sshll v18.8h, v18.8b, #0
; CHECK-NEXT: mov v1.b[5], w5
; CHECK-NEXT: ld1 { v6.b }[5], [x11]
@@ -2114,8 +2114,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: ld1 { v3.b }[5], [x14]
; CHECK-NEXT: add x9, sp, #656
; CHECK-NEXT: add x10, sp, #536
-; CHECK-NEXT: add x11, sp, #592
-; CHECK-NEXT: add x12, sp, #720
+; CHECK-NEXT: add x11, sp, #720
+; CHECK-NEXT: add x12, sp, #592
; CHECK-NEXT: sshll v18.4s, v18.4h, #0
; CHECK-NEXT: ldr b16, [sp, #208]
; CHECK-NEXT: ld1 { v6.b }[6], [x11]
@@ -2127,8 +2127,8 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: sshll v16.8h, v16.8b, #0
; CHECK-NEXT: ld1 { v3.b }[6], [x8]
; CHECK-NEXT: add x8, sp, #664
-; CHECK-NEXT: add x9, sp, #600
-; CHECK-NEXT: add x10, sp, #728
+; CHECK-NEXT: add x9, sp, #728
+; CHECK-NEXT: add x10, sp, #600
; CHECK-NEXT: mov v17.s[0], v18.s[0]
; CHECK-NEXT: ld1 { v6.b }[7], [x9]
; CHECK-NEXT: ld1 { v7.b }[7], [x10]
@@ -2151,7 +2151,7 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %
; CHECK-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-NEXT: saddl2 v16.4s, v7.8h, v6.8h
-; CHECK-NEXT: saddl2 v5.4s, v4.8h, v5.8h
+; CHECK-NEXT: saddl2 v5.4s, v5.8h, v4.8h
; CHECK-NEXT: saddl v6.4s, v7.4h, v6.4h
; CHECK-NEXT: saddw v4.4s, v17.4s, v4.4h
; CHECK-NEXT: saddl2 v17.4s, v1.8h, v0.8h
diff --git a/llvm/test/CodeGen/AArch64/pr58431.ll b/llvm/test/CodeGen/AArch64/pr58431.ll
index dcd97597ae40..e87d8f7874d6 100644
--- a/llvm/test/CodeGen/AArch64/pr58431.ll
+++ b/llvm/test/CodeGen/AArch64/pr58431.ll
@@ -4,8 +4,8 @@
define i32 @f(i64 %0) {
; CHECK-LABEL: f:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #10
-; CHECK-NEXT: mov w9, w0
+; CHECK-NEXT: mov w8, #10 // =0xa
+; CHECK-NEXT: and x9, x0, #0xffffffff
; CHECK-NEXT: udiv x10, x9, x8
; CHECK-NEXT: msub x0, x10, x8, x9
; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
diff --git a/llvm/test/CodeGen/AArch64/selectopt-not.ll b/llvm/test/CodeGen/AArch64/selectopt-not.ll
index 7a949d11c80d..a7939d651a2c 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-not.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-not.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s
+; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-STANDARD
+; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S -disable-loop-level-heuristics < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FORCED
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64"
@@ -29,10 +30,10 @@ define i32 @minloc1(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[SELECT_END]] ]
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -40,15 +41,20 @@ define i32 @minloc1(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[TMP29]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT2]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[TMP30]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[DOT2]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT045]]
-; CHECK-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
; CHECK-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[SELECT_END]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA3641_LCSSA]]
;
%4 = getelementptr i8, ptr %0, i64 40
@@ -101,53 +107,106 @@ define i32 @minloc1(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
}
define i32 @minloc1_otherunusednot(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr nocapture readonly %2) {
-; CHECK-LABEL: @minloc1_otherunusednot(
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 40
-; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64
-; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 80
-; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 88
-; CHECK-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP1:%.*]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP14]], -1
-; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], [[TMP5]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[TMP7]], 3
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP2:%.*]], align 4
-; CHECK-NEXT: [[DOTNOT:%.*]] = icmp slt i64 [[TMP9]], 1
-; CHECK-NEXT: br i1 [[DOTNOT]], label [[DOTPREHEADER:%.*]], label [[DOTPREHEADER35_LR_PH:%.*]]
-; CHECK: .preheader35.lr.ph:
-; CHECK-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
-; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
-; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
-; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], [[TMP20]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
-; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
-; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
-; CHECK-NEXT: [[DOT2]] = select i1 [[OR_COND]], i1 [[DOT045]], i1 true
-; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
-; CHECK-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
-; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[P:%.*]] = phi i1 [ false, [[TMP3]] ], [ [[NOT_OR_COND]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[Q:%.*]] = select i1 [[P]], i32 [[DOTLCSSA3641_LCSSA]], i32 1
-; CHECK-NEXT: ret i32 [[Q]]
+; CHECK-STANDARD-LABEL: @minloc1_otherunusednot(
+; CHECK-STANDARD-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 40
+; CHECK-STANDARD-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-STANDARD-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64
+; CHECK-STANDARD-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-STANDARD-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 80
+; CHECK-STANDARD-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-STANDARD-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 88
+; CHECK-STANDARD-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-STANDARD-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP0]], align 8
+; CHECK-STANDARD-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP1:%.*]], align 4
+; CHECK-STANDARD-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-STANDARD-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP14]], -1
+; CHECK-STANDARD-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], [[TMP5]]
+; CHECK-STANDARD-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP16]]
+; CHECK-STANDARD-NEXT: [[TMP18:%.*]] = shl i64 [[TMP7]], 3
+; CHECK-STANDARD-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP18]]
+; CHECK-STANDARD-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP2:%.*]], align 4
+; CHECK-STANDARD-NEXT: [[DOTNOT:%.*]] = icmp slt i64 [[TMP9]], 1
+; CHECK-STANDARD-NEXT: br i1 [[DOTNOT]], label [[DOTPREHEADER:%.*]], label [[DOTPREHEADER35_LR_PH:%.*]]
+; CHECK-STANDARD: .preheader35.lr.ph:
+; CHECK-STANDARD-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
+; CHECK-STANDARD-NEXT: br label [[DOTPREHEADER35:%.*]]
+; CHECK-STANDARD: .preheader35:
+; CHECK-STANDARD-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
+; CHECK-STANDARD-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
+; CHECK-STANDARD-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+; CHECK-STANDARD-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], [[TMP20]]
+; CHECK-STANDARD-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
+; CHECK-STANDARD-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
+; CHECK-STANDARD-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
+; CHECK-STANDARD-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-STANDARD-NEXT: [[DOT2]] = select i1 [[OR_COND]], i1 [[DOT045]], i1 true
+; CHECK-STANDARD-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
+; CHECK-STANDARD-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
+; CHECK-STANDARD-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
+; CHECK-STANDARD-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
+; CHECK-STANDARD-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
+; CHECK-STANDARD: .preheader:
+; CHECK-STANDARD-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[P:%.*]] = phi i1 [ false, [[TMP3]] ], [ [[NOT_OR_COND]], [[DOTPREHEADER35]] ]
+; CHECK-STANDARD-NEXT: [[Q:%.*]] = select i1 [[P]], i32 [[DOTLCSSA3641_LCSSA]], i32 1
+; CHECK-STANDARD-NEXT: ret i32 [[Q]]
+;
+; CHECK-FORCED-LABEL: @minloc1_otherunusednot(
+; CHECK-FORCED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 40
+; CHECK-FORCED-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8
+; CHECK-FORCED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64
+; CHECK-FORCED-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 8
+; CHECK-FORCED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 80
+; CHECK-FORCED-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-FORCED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP0]], i64 88
+; CHECK-FORCED-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8
+; CHECK-FORCED-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP0]], align 8
+; CHECK-FORCED-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP1:%.*]], align 4
+; CHECK-FORCED-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-FORCED-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP14]], -1
+; CHECK-FORCED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], [[TMP5]]
+; CHECK-FORCED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP16]]
+; CHECK-FORCED-NEXT: [[TMP18:%.*]] = shl i64 [[TMP7]], 3
+; CHECK-FORCED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP18]]
+; CHECK-FORCED-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP2:%.*]], align 4
+; CHECK-FORCED-NEXT: [[DOTNOT:%.*]] = icmp slt i64 [[TMP9]], 1
+; CHECK-FORCED-NEXT: br i1 [[DOTNOT]], label [[DOTPREHEADER:%.*]], label [[DOTPREHEADER35_LR_PH:%.*]]
+; CHECK-FORCED: .preheader35.lr.ph:
+; CHECK-FORCED-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
+; CHECK-FORCED-NEXT: br label [[DOTPREHEADER35:%.*]]
+; CHECK-FORCED: .preheader35:
+; CHECK-FORCED-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-FORCED-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2:%.*]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
+; CHECK-FORCED-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
+; CHECK-FORCED-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
+; CHECK-FORCED-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], [[TMP20]]
+; CHECK-FORCED-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
+; CHECK-FORCED-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
+; CHECK-FORCED-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
+; CHECK-FORCED-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-FORCED-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK-FORCED: select.false:
+; CHECK-FORCED-NEXT: br label [[SELECT_END]]
+; CHECK-FORCED: select.end:
+; CHECK-FORCED-NEXT: [[TMP29]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-FORCED-NEXT: [[DOT2]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-FORCED-NEXT: [[TMP30]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
+; CHECK-FORCED-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
+; CHECK-FORCED-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
+; CHECK-FORCED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
+; CHECK-FORCED-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
+; CHECK-FORCED: .preheader:
+; CHECK-FORCED-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[P:%.*]] = phi i1 [ false, [[TMP3]] ], [ [[NOT_OR_COND]], [[SELECT_END]] ]
+; CHECK-FORCED-NEXT: [[Q:%.*]] = select i1 [[P]], i32 [[DOTLCSSA3641_LCSSA]], i32 1
+; CHECK-FORCED-NEXT: ret i32 [[Q]]
;
%4 = getelementptr i8, ptr %0, i64 40
%5 = load i64, ptr %4, align 8
@@ -225,10 +284,10 @@ define i32 @minloc1_twonot(ptr nocapture readonly %0, ptr nocapture readonly %1,
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT3:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT3:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[SELECT_END]] ]
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -236,16 +295,21 @@ define i32 @minloc1_twonot(ptr nocapture readonly %0, ptr nocapture readonly %1,
; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[TMP29]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT2:%.*]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT3]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[TMP30]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[DOT2:%.*]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT045]]
-; CHECK-NEXT: [[DOT3]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT2]]
-; CHECK-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
; CHECK-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[SELECT_END]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA3641_LCSSA]]
;
%4 = getelementptr i8, ptr %0, i64 40
@@ -323,10 +387,10 @@ define i32 @minloc1_onenotdependent(ptr nocapture readonly %0, ptr nocapture rea
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 0, [[TMP7]]
; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT3:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP30:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[IV_N:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT3:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP29:%.*]], [[SELECT_END]] ]
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -334,16 +398,21 @@ define i32 @minloc1_onenotdependent(ptr nocapture readonly %0, ptr nocapture rea
; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[TMP29]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT2:%.*]] = phi i1 [ true, [[DOTPREHEADER35]] ], [ [[DOT045]], [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT3]] = phi i1 [ true, [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[TMP30]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[DOT2:%.*]] = select i1 [[OR_COND]], i1 true, i1 [[DOT045]]
-; CHECK-NEXT: [[DOT3]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT2]]
-; CHECK-NEXT: [[TMP30]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
; CHECK-NEXT: [[IV_N]] = add nuw nsw i64 [[TMP23]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_N]], [[TMP9]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP29]], [[SELECT_END]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA3641_LCSSA]]
;
%4 = getelementptr i8, ptr %0, i64 40
@@ -429,10 +498,10 @@ define i32 @minloc9(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
; CHECK-NEXT: [[DOTNEG55:%.*]] = mul i64 [[TMP7]], -8
; CHECK-NEXT: br label [[DOTPREHEADER35:%.*]]
; CHECK: .preheader35:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP78:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP79:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2_8:%.*]], [[DOTPREHEADER35]] ]
-; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP77:%.*]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ 2147483647, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP78:%.*]], [[SELECT_END15:%.*]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi i64 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP79:%.*]], [[SELECT_END15]] ]
+; CHECK-NEXT: [[DOT045:%.*]] = phi i1 [ false, [[DOTPREHEADER35_LR_PH]] ], [ [[DOT2_8:%.*]], [[SELECT_END15]] ]
+; CHECK-NEXT: [[DOTLCSSA364144:%.*]] = phi i32 [ 0, [[DOTPREHEADER35_LR_PH]] ], [ [[TMP77:%.*]], [[SELECT_END15]] ]
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw i64 [[TMP23]], [[TMP11]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4
@@ -440,95 +509,140 @@ define i32 @minloc9(ptr nocapture readonly %0, ptr nocapture readonly %1, ptr no
; CHECK-NEXT: [[TMP28:%.*]] = icmp sge i32 [[TMP26]], [[TMP22]]
; CHECK-NEXT: [[DOTNOT33:%.*]] = and i1 [[DOT045]], [[TMP28]]
; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[TMP27]], i1 true, i1 [[DOTNOT33]]
-; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[OR_COND]], i32 [[DOTLCSSA364144]], i32 1
+; CHECK-NEXT: [[OR_COND_FROZEN:%.*]] = freeze i1 [[OR_COND]]
+; CHECK-NEXT: br i1 [[OR_COND_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[TMP29:%.*]] = phi i32 [ [[DOTLCSSA364144]], [[DOTPREHEADER35]] ], [ 1, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[DOT2:%.*]] = phi i1 [ [[DOT045]], [[DOTPREHEADER35]] ], [ true, [[SELECT_FALSE]] ]
+; CHECK-NEXT: [[TMP30:%.*]] = phi i32 [ [[TMP22]], [[DOTPREHEADER35]] ], [ [[TMP20]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[NOT_OR_COND:%.*]] = xor i1 [[OR_COND]], true
-; CHECK-NEXT: [[DOT2:%.*]] = select i1 [[NOT_OR_COND]], i1 true, i1 [[DOT045]]
-; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[OR_COND]], i32 [[TMP22]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
; CHECK-NEXT: [[TMP33:%.*]] = icmp ne i32 [[TMP32]], [[TMP20]]
; CHECK-NEXT: [[TMP34:%.*]] = icmp sge i32 [[TMP32]], [[TMP30]]
; CHECK-NEXT: [[DOTNOT33_1:%.*]] = and i1 [[DOT2]], [[TMP34]]
; CHECK-NEXT: [[OR_COND_1:%.*]] = select i1 [[TMP33]], i1 true, i1 [[DOTNOT33_1]]
-; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[OR_COND_1]], i32 [[TMP29]], i32 2
+; CHECK-NEXT: [[OR_COND_1_FROZEN:%.*]] = freeze i1 [[OR_COND_1]]
+; CHECK-NEXT: br i1 [[OR_COND_1_FROZEN]], label [[SELECT_END1:%.*]], label [[SELECT_FALSE2:%.*]]
+; CHECK: select.false2:
+; CHECK-NEXT: br label [[SELECT_END1]]
+; CHECK: select.end1:
+; CHECK-NEXT: [[TMP35:%.*]] = phi i32 [ [[TMP29]], [[SELECT_END]] ], [ 2, [[SELECT_FALSE2]] ]
+; CHECK-NEXT: [[DOT2_1:%.*]] = phi i1 [ [[DOT2]], [[SELECT_END]] ], [ true, [[SELECT_FALSE2]] ]
+; CHECK-NEXT: [[TMP36:%.*]] = phi i32 [ [[TMP30]], [[SELECT_END]] ], [ [[TMP20]], [[SELECT_FALSE2]] ]
; CHECK-NEXT: [[NOT_OR_COND_1:%.*]] = xor i1 [[OR_COND_1]], true
-; CHECK-NEXT: [[DOT2_1:%.*]] = select i1 [[NOT_OR_COND_1]], i1 true, i1 [[DOT2]]
-; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[OR_COND_1]], i32 [[TMP30]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG]]
; CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
; CHECK-NEXT: [[TMP39:%.*]] = icmp ne i32 [[TMP38]], [[TMP20]]
; CHECK-NEXT: [[TMP40:%.*]] = icmp sge i32 [[TMP38]], [[TMP36]]
; CHECK-NEXT: [[DOTNOT33_2:%.*]] = and i1 [[DOT2_1]], [[TMP40]]
; CHECK-NEXT: [[OR_COND_2:%.*]] = select i1 [[TMP39]], i1 true, i1 [[DOTNOT33_2]]
-; CHECK-NEXT: [[TMP41:%.*]] = select i1 [[OR_COND_2]], i32 [[TMP35]], i32 3
+; CHECK-NEXT: [[OR_COND_2_FROZEN:%.*]] = freeze i1 [[OR_COND_2]]
+; CHECK-NEXT: br i1 [[OR_COND_2_FROZEN]], label [[SELECT_END3:%.*]], label [[SELECT_FALSE4:%.*]]
+; CHECK: select.false4:
+; CHECK-NEXT: br label [[SELECT_END3]]
+; CHECK: select.end3:
+; CHECK-NEXT: [[TMP41:%.*]] = phi i32 [ [[TMP35]], [[SELECT_END1]] ], [ 3, [[SELECT_FALSE4]] ]
+; CHECK-NEXT: [[DOT2_2:%.*]] = phi i1 [ [[DOT2_1]], [[SELECT_END1]] ], [ true, [[SELECT_FALSE4]] ]
+; CHECK-NEXT: [[TMP42:%.*]] = phi i32 [ [[TMP36]], [[SELECT_END1]] ], [ [[TMP20]], [[SELECT_FALSE4]] ]
; CHECK-NEXT: [[NOT_OR_COND_2:%.*]] = xor i1 [[OR_COND_2]], true
-; CHECK-NEXT: [[DOT2_2:%.*]] = select i1 [[NOT_OR_COND_2]], i1 true, i1 [[DOT2_1]]
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[OR_COND_2]], i32 [[TMP36]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG50]]
; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[TMP43]], align 4
; CHECK-NEXT: [[TMP45:%.*]] = icmp ne i32 [[TMP44]], [[TMP20]]
; CHECK-NEXT: [[TMP46:%.*]] = icmp sge i32 [[TMP44]], [[TMP42]]
; CHECK-NEXT: [[DOTNOT33_3:%.*]] = and i1 [[DOT2_2]], [[TMP46]]
; CHECK-NEXT: [[OR_COND_3:%.*]] = select i1 [[TMP45]], i1 true, i1 [[DOTNOT33_3]]
-; CHECK-NEXT: [[TMP47:%.*]] = select i1 [[OR_COND_3]], i32 [[TMP41]], i32 4
+; CHECK-NEXT: [[OR_COND_3_FROZEN:%.*]] = freeze i1 [[OR_COND_3]]
+; CHECK-NEXT: br i1 [[OR_COND_3_FROZEN]], label [[SELECT_END5:%.*]], label [[SELECT_FALSE6:%.*]]
+; CHECK: select.false6:
+; CHECK-NEXT: br label [[SELECT_END5]]
+; CHECK: select.end5:
+; CHECK-NEXT: [[TMP47:%.*]] = phi i32 [ [[TMP41]], [[SELECT_END3]] ], [ 4, [[SELECT_FALSE6]] ]
+; CHECK-NEXT: [[DOT2_3:%.*]] = phi i1 [ [[DOT2_2]], [[SELECT_END3]] ], [ true, [[SELECT_FALSE6]] ]
+; CHECK-NEXT: [[TMP48:%.*]] = phi i32 [ [[TMP42]], [[SELECT_END3]] ], [ [[TMP20]], [[SELECT_FALSE6]] ]
; CHECK-NEXT: [[NOT_OR_COND_3:%.*]] = xor i1 [[OR_COND_3]], true
-; CHECK-NEXT: [[DOT2_3:%.*]] = select i1 [[NOT_OR_COND_3]], i1 true, i1 [[DOT2_2]]
-; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[OR_COND_3]], i32 [[TMP42]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG51]]
; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP49]], align 4
; CHECK-NEXT: [[TMP51:%.*]] = icmp ne i32 [[TMP50]], [[TMP20]]
; CHECK-NEXT: [[TMP52:%.*]] = icmp sge i32 [[TMP50]], [[TMP48]]
; CHECK-NEXT: [[DOTNOT33_4:%.*]] = and i1 [[DOT2_3]], [[TMP52]]
; CHECK-NEXT: [[OR_COND_4:%.*]] = select i1 [[TMP51]], i1 true, i1 [[DOTNOT33_4]]
-; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[OR_COND_4]], i32 [[TMP47]], i32 5
+; CHECK-NEXT: [[OR_COND_4_FROZEN:%.*]] = freeze i1 [[OR_COND_4]]
+; CHECK-NEXT: br i1 [[OR_COND_4_FROZEN]], label [[SELECT_END7:%.*]], label [[SELECT_FALSE8:%.*]]
+; CHECK: select.false8:
+; CHECK-NEXT: br label [[SELECT_END7]]
+; CHECK: select.end7:
+; CHECK-NEXT: [[TMP53:%.*]] = phi i32 [ [[TMP47]], [[SELECT_END5]] ], [ 5, [[SELECT_FALSE8]] ]
+; CHECK-NEXT: [[DOT2_4:%.*]] = phi i1 [ [[DOT2_3]], [[SELECT_END5]] ], [ true, [[SELECT_FALSE8]] ]
+; CHECK-NEXT: [[TMP54:%.*]] = phi i32 [ [[TMP48]], [[SELECT_END5]] ], [ [[TMP20]], [[SELECT_FALSE8]] ]
; CHECK-NEXT: [[NOT_OR_COND_4:%.*]] = xor i1 [[OR_COND_4]], true
-; CHECK-NEXT: [[DOT2_4:%.*]] = select i1 [[NOT_OR_COND_4]], i1 true, i1 [[DOT2_3]]
-; CHECK-NEXT: [[TMP54:%.*]] = select i1 [[OR_COND_4]], i32 [[TMP48]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG52]]
; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP55]], align 4
; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i32 [[TMP56]], [[TMP20]]
; CHECK-NEXT: [[TMP58:%.*]] = icmp sge i32 [[TMP56]], [[TMP54]]
; CHECK-NEXT: [[DOTNOT33_5:%.*]] = and i1 [[DOT2_4]], [[TMP58]]
; CHECK-NEXT: [[OR_COND_5:%.*]] = select i1 [[TMP57]], i1 true, i1 [[DOTNOT33_5]]
-; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[OR_COND_5]], i32 [[TMP53]], i32 6
+; CHECK-NEXT: [[OR_COND_5_FROZEN:%.*]] = freeze i1 [[OR_COND_5]]
+; CHECK-NEXT: br i1 [[OR_COND_5_FROZEN]], label [[SELECT_END9:%.*]], label [[SELECT_FALSE10:%.*]]
+; CHECK: select.false10:
+; CHECK-NEXT: br label [[SELECT_END9]]
+; CHECK: select.end9:
+; CHECK-NEXT: [[TMP59:%.*]] = phi i32 [ [[TMP53]], [[SELECT_END7]] ], [ 6, [[SELECT_FALSE10]] ]
+; CHECK-NEXT: [[DOT2_5:%.*]] = phi i1 [ [[DOT2_4]], [[SELECT_END7]] ], [ true, [[SELECT_FALSE10]] ]
+; CHECK-NEXT: [[TMP60:%.*]] = phi i32 [ [[TMP54]], [[SELECT_END7]] ], [ [[TMP20]], [[SELECT_FALSE10]] ]
; CHECK-NEXT: [[NOT_OR_COND_5:%.*]] = xor i1 [[OR_COND_5]], true
-; CHECK-NEXT: [[DOT2_5:%.*]] = select i1 [[NOT_OR_COND_5]], i1 true, i1 [[DOT2_4]]
-; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[OR_COND_5]], i32 [[TMP54]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG53]]
; CHECK-NEXT: [[TMP62:%.*]] = load i32, ptr [[TMP61]], align 4
; CHECK-NEXT: [[TMP63:%.*]] = icmp ne i32 [[TMP62]], [[TMP20]]
; CHECK-NEXT: [[TMP64:%.*]] = icmp sge i32 [[TMP62]], [[TMP60]]
; CHECK-NEXT: [[DOTNOT33_6:%.*]] = and i1 [[DOT2_5]], [[TMP64]]
; CHECK-NEXT: [[OR_COND_6:%.*]] = select i1 [[TMP63]], i1 true, i1 [[DOTNOT33_6]]
-; CHECK-NEXT: [[TMP65:%.*]] = select i1 [[OR_COND_6]], i32 [[TMP59]], i32 7
+; CHECK-NEXT: [[OR_COND_6_FROZEN:%.*]] = freeze i1 [[OR_COND_6]]
+; CHECK-NEXT: br i1 [[OR_COND_6_FROZEN]], label [[SELECT_END11:%.*]], label [[SELECT_FALSE12:%.*]]
+; CHECK: select.false12:
+; CHECK-NEXT: br label [[SELECT_END11]]
+; CHECK: select.end11:
+; CHECK-NEXT: [[TMP65:%.*]] = phi i32 [ [[TMP59]], [[SELECT_END9]] ], [ 7, [[SELECT_FALSE12]] ]
+; CHECK-NEXT: [[DOT2_6:%.*]] = phi i1 [ [[DOT2_5]], [[SELECT_END9]] ], [ true, [[SELECT_FALSE12]] ]
+; CHECK-NEXT: [[TMP66:%.*]] = phi i32 [ [[TMP60]], [[SELECT_END9]] ], [ [[TMP20]], [[SELECT_FALSE12]] ]
; CHECK-NEXT: [[NOT_OR_COND_6:%.*]] = xor i1 [[OR_COND_6]], true
-; CHECK-NEXT: [[DOT2_6:%.*]] = select i1 [[NOT_OR_COND_6]], i1 true, i1 [[DOT2_5]]
-; CHECK-NEXT: [[TMP66:%.*]] = select i1 [[OR_COND_6]], i32 [[TMP60]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG54]]
; CHECK-NEXT: [[TMP68:%.*]] = load i32, ptr [[TMP67]], align 4
; CHECK-NEXT: [[TMP69:%.*]] = icmp ne i32 [[TMP68]], [[TMP20]]
; CHECK-NEXT: [[TMP70:%.*]] = icmp sge i32 [[TMP68]], [[TMP66]]
; CHECK-NEXT: [[DOTNOT33_7:%.*]] = and i1 [[DOT2_6]], [[TMP70]]
; CHECK-NEXT: [[OR_COND_7:%.*]] = select i1 [[TMP69]], i1 true, i1 [[DOTNOT33_7]]
-; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[OR_COND_7]], i32 [[TMP65]], i32 8
+; CHECK-NEXT: [[OR_COND_7_FROZEN:%.*]] = freeze i1 [[OR_COND_7]]
+; CHECK-NEXT: br i1 [[OR_COND_7_FROZEN]], label [[SELECT_END13:%.*]], label [[SELECT_FALSE14:%.*]]
+; CHECK: select.false14:
+; CHECK-NEXT: br label [[SELECT_END13]]
+; CHECK: select.end13:
+; CHECK-NEXT: [[TMP71:%.*]] = phi i32 [ [[TMP65]], [[SELECT_END11]] ], [ 8, [[SELECT_FALSE14]] ]
+; CHECK-NEXT: [[DOT2_7:%.*]] = phi i1 [ [[DOT2_6]], [[SELECT_END11]] ], [ true, [[SELECT_FALSE14]] ]
+; CHECK-NEXT: [[TMP72:%.*]] = phi i32 [ [[TMP66]], [[SELECT_END11]] ], [ [[TMP20]], [[SELECT_FALSE14]] ]
; CHECK-NEXT: [[NOT_OR_COND_7:%.*]] = xor i1 [[OR_COND_7]], true
-; CHECK-NEXT: [[DOT2_7:%.*]] = select i1 [[NOT_OR_COND_7]], i1 true, i1 [[DOT2_6]]
-; CHECK-NEXT: [[TMP72:%.*]] = select i1 [[OR_COND_7]], i32 [[TMP66]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[DOTNEG55]]
; CHECK-NEXT: [[TMP74:%.*]] = load i32, ptr [[TMP73]], align 4
; CHECK-NEXT: [[TMP75:%.*]] = icmp ne i32 [[TMP74]], [[TMP20]]
; CHECK-NEXT: [[TMP76:%.*]] = icmp sge i32 [[TMP74]], [[TMP72]]
; CHECK-NEXT: [[DOTNOT33_8:%.*]] = and i1 [[DOT2_7]], [[TMP76]]
; CHECK-NEXT: [[OR_COND_8:%.*]] = select i1 [[TMP75]], i1 true, i1 [[DOTNOT33_8]]
-; CHECK-NEXT: [[TMP77]] = select i1 [[OR_COND_8]], i32 [[TMP71]], i32 9
+; CHECK-NEXT: [[OR_COND_8_FROZEN:%.*]] = freeze i1 [[OR_COND_8]]
+; CHECK-NEXT: br i1 [[OR_COND_8_FROZEN]], label [[SELECT_END15]], label [[SELECT_FALSE16:%.*]]
+; CHECK: select.false16:
+; CHECK-NEXT: br label [[SELECT_END15]]
+; CHECK: select.end15:
+; CHECK-NEXT: [[TMP77]] = phi i32 [ [[TMP71]], [[SELECT_END13]] ], [ 9, [[SELECT_FALSE16]] ]
+; CHECK-NEXT: [[DOT2_8]] = phi i1 [ [[DOT2_7]], [[SELECT_END13]] ], [ true, [[SELECT_FALSE16]] ]
+; CHECK-NEXT: [[TMP78]] = phi i32 [ [[TMP72]], [[SELECT_END13]] ], [ [[TMP20]], [[SELECT_FALSE16]] ]
; CHECK-NEXT: [[NOT_OR_COND_8:%.*]] = xor i1 [[OR_COND_8]], true
-; CHECK-NEXT: [[DOT2_8]] = select i1 [[NOT_OR_COND_8]], i1 true, i1 [[DOT2_7]]
-; CHECK-NEXT: [[TMP78]] = select i1 [[OR_COND_8]], i32 [[TMP72]], i32 [[TMP20]]
; CHECK-NEXT: [[TMP79]] = add nuw nsw i64 [[TMP23]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP79]], [[TMP9]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[DOTPREHEADER]], label [[DOTPREHEADER35]]
; CHECK: .preheader:
-; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP77]], [[DOTPREHEADER35]] ]
+; CHECK-NEXT: [[DOTLCSSA3641_LCSSA:%.*]] = phi i32 [ 0, [[TMP3:%.*]] ], [ [[TMP77]], [[SELECT_END15]] ]
; CHECK-NEXT: ret i32 [[DOTLCSSA3641_LCSSA]]
;
%4 = getelementptr i8, ptr %0, i64 40
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 8cb8b1c92fa7..8ce24ceb33d7 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -72,6 +72,203 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
ret <vscale x 16 x i1> %ret
}
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_1xv16i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: RET_ReallyLR implicit $p0
+ %res = extractvalue [1 x <vscale x 16 x i1>] %arg2, 0
+ ret <vscale x 16 x i1> %res
+}
+
+; Test that arg1 is stored to the stack from p0, and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; x0 = stack_loc_for_args
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i1_4xv16i1([1 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: [[STACK:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_4xv16i1_1xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call <vscale x 16 x i1> @callee_with_svepred_arg_4xv16i1_1xv16i1([4 x <vscale x 16 x i1>] %arg2, [1 x <vscale x 16 x i1>] %arg1)
+ ret <vscale x 16 x i1> %res
+}
+
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_4xv16i1_4xv16i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ ret [4 x <vscale x 16 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3, and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_4xv16i1_4xv16i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $p1, implicit $p2, implicit $p3, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_4xv16i1_4xv16i1([4 x <vscale x 16 x i1>] %arg2, [4 x <vscale x 16 x i1>] %arg1)
+ ret [4 x <vscale x 16 x i1>] %res
+}
+
+; Test that arg2 is passed through x0, i.e., x0 = &%arg2; and return values are loaded from x0:
+; P0 = ldr [x0]
+; P1 = ldr [x0 + sizeof(Px)]
+; P2 = ldr [x0 + 2*sizeof(Px)]
+; P3 = ldr [x0 + 3*sizeof(Px)]
+define aarch64_sve_vector_pcs [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg1, [2 x <vscale x 32 x i1>] %arg2) {
+; CHECK: name: callee_with_svepred_arg_1xv16i1_2xv32i1
+; CHECK: [[BASE:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[BASE]], killed [[OFFSET1]]
+; CHECK: [[PRED1:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET2]]
+; CHECK: [[PRED2:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[BASE]], killed [[OFFSET3]]
+; CHECK: [[PRED3:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[PRED0:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: $p0 = COPY [[PRED0]]
+; CHECK: $p1 = COPY [[PRED1]]
+; CHECK: $p2 = COPY [[PRED2]]
+; CHECK: $p3 = COPY [[PRED3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ ret [2 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that arg1 is stored to the stack from p0~p3, and the stack location is passed through x0 to set up the call:
+; str P0, [stack_loc_for_args]
+; str P1, [stack_loc_for_args + sizeof(Px)]
+; str P2, [stack_loc_for_args + 2*sizeof(Px)]
+; str P3, [stack_loc_for_args + 3*sizeof(Px)]
+; x0 = stack_loc_for_args
+define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 16 x i1>] %arg2) {
+; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
+; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[STACK:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET3]]
+; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+; CHECK: STR_PXI [[PRED3]], killed [[ADDR3]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[STACK]], killed [[OFFSET2]]
+; CHECK: STR_PXI [[PRED2]], killed [[ADDR2]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[STACK]], killed [[OFFSET1]]
+; CHECK: STR_PXI [[PRED1]], killed [[ADDR1]], 0 :: (store (<vscale x 1 x s16>))
+; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
+; CHECK: $x0 = COPY [[STACK]]
+; CHECK: BL @callee_with_svepred_arg_1xv16i1_2xv32i1, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $p0, implicit $x0, implicit-def $sp, implicit-def $p0, implicit-def $p1, implicit-def $p2, implicit-def $p3
+; CHECK: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ %res = call [2 x <vscale x 32 x i1>] @callee_with_svepred_arg_1xv16i1_2xv32i1([1 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 32 x i1>] %arg1)
+ ret [2 x <vscale x 32 x i1>] %res
+}
+
+; Test that arg1 and arg3 are passed via P0~P3, and arg2 is passed indirectly through an address on the stack in x0
+define aarch64_sve_vector_pcs [4 x <vscale x 16 x i1>] @callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1([2 x <vscale x 16 x i1>] %arg1, [4 x <vscale x 16 x i1>] %arg2, [2 x <vscale x 16 x i1>] %arg3) nounwind {
+; CHECK: name: callee_with_svepred_arg_2xv16i1_4xv16i1_2xv16i1
+; CHECK: [[P3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[P2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[X0:%[0-9]+]]:gpr64common = COPY $x0
+; CHECK: [[P1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[P0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[OFFSET3:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 3, implicit $vg
+; CHECK: [[ADDR3:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET3]]
+; CHECK: [[P7:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR3]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET2:%[0-9]+]]:gpr64 = CNTW_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR2:%[0-9]+]]:gpr64common = ADDXrr [[X0]], killed [[OFFSET2]]
+; CHECK: [[P6:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR2]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[OFFSET1:%[0-9]+]]:gpr64 = CNTD_XPiI 31, 1, implicit $vg
+; CHECK: [[ADDR1:%[0-9]+]]:gpr64common = nuw ADDXrr [[X0]], killed [[OFFSET1]]
+; CHECK: [[P5:%[0-9]+]]:ppr = LDR_PXI killed [[ADDR1]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[P4:%[0-9]+]]:ppr = LDR_PXI [[X0]], 0 :: (load (<vscale x 1 x s16>))
+; CHECK: [[RES0:%[0-9]+]]:ppr = AND_PPzPP [[P0]], [[P0]], killed [[P4]]
+; CHECK: [[RES1:%[0-9]+]]:ppr = AND_PPzPP [[P1]], [[P1]], killed [[P5]]
+; CHECK: [[RES2:%[0-9]+]]:ppr = AND_PPzPP [[P2]], [[P2]], killed [[P6]]
+; CHECK: [[RES3:%[0-9]+]]:ppr = AND_PPzPP [[P3]], [[P3]], killed [[P7]]
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+ %p0 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 0
+ %p1 = extractvalue [2 x <vscale x 16 x i1>] %arg1, 1
+ %p2 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 0
+ %p3 = extractvalue [2 x <vscale x 16 x i1>] %arg3, 1
+ %p4 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 0
+ %p5 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 1
+ %p6 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 2
+ %p7 = extractvalue [4 x <vscale x 16 x i1>] %arg2, 3
+ %r0 = and <vscale x 16 x i1> %p0, %p4
+ %r1 = and <vscale x 16 x i1> %p1, %p5
+ %r2 = and <vscale x 16 x i1> %p2, %p6
+ %r3 = and <vscale x 16 x i1> %p3, %p7
+ %1 = insertvalue [4 x <vscale x 16 x i1>] undef, <vscale x 16 x i1> %r0, 0
+ %2 = insertvalue [4 x <vscale x 16 x i1>] %1, <vscale x 16 x i1> %r1, 1
+ %3 = insertvalue [4 x <vscale x 16 x i1>] %2, <vscale x 16 x i1> %r2, 2
+ %4 = insertvalue [4 x <vscale x 16 x i1>] %3, <vscale x 16 x i1> %r3, 3
+ ret [4 x <vscale x 16 x i1>] %4
+}
+
; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
; i.e. x0 = %x0
; :
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
index 0a45244f12be..bfb750517cbf 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention.ll
@@ -128,6 +128,52 @@ define <vscale x 4 x i1> @sve_signature_pred(<vscale x 4 x i1> %arg1, <vscale x
ret <vscale x 4 x i1> %arg2
}
+; Test that scalable predicate arguments of [1 x <vscale x 4 x i1>] type are properly assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ ret [1 x <vscale x 4 x i1>] %arg2
+}
+
+; Test that up to two scalable predicate arguments of [2 x <vscale x 4 x i1>] type can be assigned to P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ ret [2 x <vscale x 4 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument of [1 x <vscale x 32 x i1>] type is assigned to two P registers.
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p2
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+ ret [1 x <vscale x 32 x i1>] %arg2
+}
+
+; Test that a scalable predicate argument of [2 x <vscale x 32 x i1>] type is assigned to four P registers.
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1) nounwind {
+ ret [2 x <vscale x 32 x i1>] %arg1
+}
+
; CHECK-LABEL: name: sve_signature_vec_caller
; CHECK-DAG: [[ARG2:%[0-9]+]]:zpr = COPY $z1
; CHECK-DAG: [[ARG1:%[0-9]+]]:zpr = COPY $z0
@@ -156,6 +202,84 @@ define <vscale x 4 x i1> @sve_signature_pred_caller(<vscale x 4 x i1> %arg1, <vs
ret <vscale x 4 x i1> %res
}
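+; Test that a [1 x <vscale x 4 x i1>] argument and return value are exchanged through single P registers across the call.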
+; CHECK-LABEL: name: sve_signature_pred_1xv4i1_caller
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv4i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES:%[0-9]+]]:ppr = COPY $p0
+; CHECK: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+define [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1_caller([1 x <vscale x 4 x i1>] %arg1, [1 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 4 x i1>] @sve_signature_pred_1xv4i1([1 x <vscale x 4 x i1>] %arg2, [1 x <vscale x 4 x i1>] %arg1)
+ ret [1 x <vscale x 4 x i1>] %res
+}
+
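+; Test that two [2 x <vscale x 4 x i1>] arguments are passed in P0~P3 and the result is returned in P0~P1.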
+; CHECK-LABEL: name: sve_signature_pred_2xv4i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv4i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind {
+ %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
+ ret [2 x <vscale x 4 x i1>] %res
+}
+
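+; Test that each [1 x <vscale x 32 x i1>] argument is split across two P registers and the result is returned in P0~P1.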
+; CHECK-LABEL: name: sve_signature_pred_1xv32i1_caller
+; CHECK-DAG: [[ARG2_2:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2_1:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1_2:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG1_1:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG2_1]]
+; CHECK-DAG: $p1 = COPY [[ARG2_2]]
+; CHECK-DAG: $p2 = COPY [[ARG1_1]]
+; CHECK-DAG: $p3 = COPY [[ARG1_2]]
+; CHECK-NEXT: BL @sve_signature_pred_1xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1
+define [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1_caller([1 x <vscale x 32 x i1>] %arg1, [1 x <vscale x 32 x i1>] %arg2) nounwind {
+ %res = call [1 x <vscale x 32 x i1>] @sve_signature_pred_1xv32i1([1 x <vscale x 32 x i1>] %arg2, [1 x <vscale x 32 x i1>] %arg1)
+ ret [1 x <vscale x 32 x i1>] %res
+}
+
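+; Test that a [2 x <vscale x 32 x i1>] argument occupies P0~P3 and the result is returned in the same four registers.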
+; CHECK-LABEL: name: sve_signature_pred_2xv32i1_caller
+; CHECK-DAG: [[ARG3:%[0-9]+]]:ppr = COPY $p3
+; CHECK-DAG: [[ARG2:%[0-9]+]]:ppr = COPY $p2
+; CHECK-DAG: [[ARG1:%[0-9]+]]:ppr = COPY $p1
+; CHECK-DAG: [[ARG0:%[0-9]+]]:ppr = COPY $p0
+; CHECK-DAG: $p0 = COPY [[ARG0]]
+; CHECK-DAG: $p1 = COPY [[ARG1]]
+; CHECK-DAG: $p2 = COPY [[ARG2]]
+; CHECK-DAG: $p3 = COPY [[ARG3]]
+; CHECK-NEXT: BL @sve_signature_pred_2xv32i1, csr_aarch64_sve_aapcs
+; CHECK: [[RES0:%[0-9]+]]:ppr = COPY $p0
+; CHECK: [[RES1:%[0-9]+]]:ppr = COPY $p1
+; CHECK: [[RES2:%[0-9]+]]:ppr = COPY $p2
+; CHECK: [[RES3:%[0-9]+]]:ppr = COPY $p3
+; CHECK: $p0 = COPY [[RES0]]
+; CHECK: $p1 = COPY [[RES1]]
+; CHECK: $p2 = COPY [[RES2]]
+; CHECK: $p3 = COPY [[RES3]]
+; CHECK: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+define [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1_caller([2 x <vscale x 32 x i1>] %arg1) {
+ %res = call [2 x <vscale x 32 x i1>] @sve_signature_pred_2xv32i1([2 x <vscale x 32 x i1>] %arg1)
+ ret [2 x <vscale x 32 x i1>] %res
+}
+
; Test that functions returning or taking SVE arguments use the correct
; callee-saved set when using the default C calling convention (as opposed
; to aarch64_sve_vector_pcs)
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
index 28094c7b68e7..276f23703df3 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128_NOMAX
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefixes=CHECK,SVE2_NOMIN_NOMAX
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_MIN_256_NOMAX
+; RUN: llc -mattr=+sve2 -force-streaming-compatible -aarch64-sve-vector-bits-min=128 -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128
+; RUN: llc -mattr=+sve2 -force-streaming-compatible -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_128_NOMAX
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s -check-prefixes=CHECK,SVE2_NOMIN_NOMAX
+; RUN: llc -mattr=+sve2 -force-streaming-compatible -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,SVE2_MIN_256_NOMAX
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
index 1a2ab8d4253a..b0b6a6a530dd 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll
@@ -2,7 +2,7 @@
; RUN: llc -mattr=+sve < %s | FileCheck %s
; Streaming-compatible SVE doesn't include FADDA, so this shouldn't compile!
-; RUN: not --crash llc -mattr=+sve -force-streaming-compatible-sve < %s
+; RUN: not --crash llc -mattr=+sve -force-streaming-compatible < %s
target triple = "aarch64-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-pr92779.ll b/llvm/test/CodeGen/AArch64/sve-pr92779.ll
new file mode 100644
index 000000000000..e25794817add
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-pr92779.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck %s
+
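+; Reduced regression test for PR92779.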
+define void @main(ptr %0) {
+; CHECK-LABEL: main:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: mov z1.d, z0.d
+; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8
+; CHECK-NEXT: uzp1 v1.2s, v0.2s, v1.2s
+; CHECK-NEXT: neg v1.2s, v1.2s
+; CHECK-NEXT: smov x8, v1.s[0]
+; CHECK-NEXT: smov x9, v1.s[1]
+; CHECK-NEXT: mov z0.d, p0/m, x8
+; CHECK-NEXT: mov z0.d, p0/m, x9
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+"entry":
+ %1 = bitcast <vscale x 2 x i64> zeroinitializer to <vscale x 4 x i32>
+ %a = extractelement <vscale x 4 x i32> %1, i64 0
+ %b = insertelement <2 x i32> zeroinitializer, i32 %a, i64 0
+ %2 = bitcast <vscale x 2 x i64> zeroinitializer to <vscale x 4 x i32>
+ %c = extractelement <vscale x 4 x i32> %2, i64 2
+ %d = insertelement <2 x i32> %b, i32 %c, i64 1
+ %e = sub <2 x i32> zeroinitializer, %d
+ %f = extractelement <2 x i32> %e, i64 0
+ %g = sext i32 %f to i64
+ %h = insertelement <vscale x 2 x i64> zeroinitializer, i64 %g, i64 0
+ %i = extractelement <2 x i32> %e, i64 1
+ %j = sext i32 %i to i64
+ %k = insertelement <vscale x 2 x i64> %h, i64 %j, i64 0
+ store <vscale x 2 x i64> %k, ptr %0, align 16
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
index d81f725eaefc..e843537c10a3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
index d547f99a0230..aa42d5c2a8c1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
index e3cc74f766ee..260ad16581f1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
index 74a4aab15597..9a07bd8bd5ac 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
index 0c490a662a79..aec434b4819d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
index 86494c4be501..82e75d6efda3 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 0aefba2d4c6a..040e5861e981 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
index 25ecd7a8d7e3..45a804becbc5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
index a752e119b2fb..9c3b5e14289d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
index f017eead92cf..21ce689f68e2 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
index c2d6ed4e9ccf..b0a82e699939 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
index 465cc179a3b9..cbd0ad66fba7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
index 9bdde14e8d83..57d072a7bcd6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
index 244a40510173..6a2dc3c71825 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
index cbe71d715a8f..153a04f48657 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
index 94a74763aa0e..6945a6102c05 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
index b56e67d95ba0..e239ff5e35fd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=FA64
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=NO-FA64
+; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
index df9613a30e40..78ae7bb6cf30 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
index 7ddc641f366c..412c27cb82f1 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
index 7d36925fdc57..89697cde848b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
index bf8a335a8503..5840ffb20994 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
index 30a4f04a3d2b..c1c7b5c05f5d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
index 4aa965777c74..ff38db8c10c0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
index 8baa87c6d686..ee1706bc7c35 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
index 73c1eac99dd3..c2f3bbfb51dd 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
index 5158dda37a8b..e6fd775b4cfb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
index c7a89612d278..e40668a8696e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
index f028b3eeca25..54276bb4ba01 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
index 4d70c1dd1c91..40824ba9ae9c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
index 50cf9b73d9a7..74ee5482a60c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
index 149ad6d1e267..3ff6983210a0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=FA64
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s -check-prefix=NO-FA64
+; RUN: llc -mattr=+sme-fa64 -force-streaming-compatible < %s | FileCheck %s -check-prefix=FA64
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s -check-prefix=NO-FA64
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
index cb7fa53eac51..8917f43002da 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
-; RUN: llc -mattr=+sve2 -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; This test only tests the legal types for a given vector width, as mulh nodes
; do not get generated for non-legal types.
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
index 751f43768a51..1123907f3389 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
index d373a9063f85..4ae7586fca16 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
index 906112f7ac39..bfffe4b6315d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
index 9ed52e321d9a..9319bd69c25f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
index a9b52c93006d..27dbfc9a23a8 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
index 81bbaa92d4b4..3775a64a89a0 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index 318285ded5a8..0b6152340f65 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
index 885030861469..918f0ccc0cf6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
index 8ca8e6980913..8c69d5b0bb37 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
index c4aeb4465c53..ef52eadc5d3b 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
index ca58099244cf..4f8f8c2e4b24 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
index f2b3f9b12ea7..bd6b96889b4c 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
index b5adea594242..aef446a90df6 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
index 00413302798c..6d91253caae5 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
index cb73030306b0..8808ad9a23d7 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
index ab7c42b3e9e3..8039bd096bcb 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
index 362612518787..9741147b332e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reshuffle.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
index bfa931044bc5..726fd28c90ae 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
index 9dd42e7831e0..c022bf85e67e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
index 6f82c97f3b87..38aaf860b729 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
index 323d5278592f..649b13fa8a1e 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
index 06709ca3685c..c7435bdbec94 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
index 838db0ce8185..9e04fc236836 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
; Test we can code generater patterns of the form:
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
index 7e3a175c40d2..b34fe438a063 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
index 70219dd30f76..9e56462df388 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index 175731480407..304823c9e641 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
index 337a2134de5b..6c9c05560566 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
-; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index 18cd4cc2111a..c4a58ba12dc6 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -571,29 +571,27 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: LBB5_1: ; %loop
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp x10, x9, [x0]
-; CHECK-NEXT: ldrb w13, [x0, #18]
-; CHECK-NEXT: ldrh w14, [x0, #16]
+; CHECK-NEXT: ldp x9, x10, [x0]
+; CHECK-NEXT: ldrb w14, [x0, #18]
+; CHECK-NEXT: ldrh w15, [x0, #16]
; CHECK-NEXT: add x0, x0, #32
-; CHECK-NEXT: ubfx x12, x9, #12, #20
-; CHECK-NEXT: fmov s0, w10
-; CHECK-NEXT: lsr x11, x10, #19
-; CHECK-NEXT: lsr x15, x9, #31
-; CHECK-NEXT: fmov s1, w12
-; CHECK-NEXT: lsr x12, x9, #50
-; CHECK-NEXT: mov.s v0[1], w11
-; CHECK-NEXT: orr w11, w14, w13, lsl #16
-; CHECK-NEXT: lsr x13, x10, #38
-; CHECK-NEXT: lsr x10, x10, #57
-; CHECK-NEXT: mov.s v1[1], w15
-; CHECK-NEXT: orr w12, w12, w11, lsl #14
-; CHECK-NEXT: orr w9, w10, w9, lsl #7
-; CHECK-NEXT: lsr w10, w11, #5
-; CHECK-NEXT: mov.s v0[2], w13
+; CHECK-NEXT: ubfx x12, x10, #12, #20
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: lsr x11, x9, #19
+; CHECK-NEXT: lsr x13, x10, #31
+; CHECK-NEXT: fmov s0, w12
+; CHECK-NEXT: lsr x12, x9, #38
+; CHECK-NEXT: extr x9, x10, x9, #57
+; CHECK-NEXT: mov.s v1[1], w11
+; CHECK-NEXT: orr x11, x15, x14, lsl #16
+; CHECK-NEXT: mov.s v0[1], w13
+; CHECK-NEXT: extr x13, x11, x10, #50
+; CHECK-NEXT: ubfx x10, x11, #5, #27
; CHECK-NEXT: mov.s v1[2], w12
-; CHECK-NEXT: mov.s v0[3], w9
-; CHECK-NEXT: mov.s v1[3], w10
-; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: mov.s v0[2], w13
+; CHECK-NEXT: mov.s v1[3], w9
+; CHECK-NEXT: mov.s v0[3], w10
+; CHECK-NEXT: uzp1.8h v0, v1, v0
; CHECK-NEXT: xtn.8b v0, v0
; CHECK-NEXT: str d0, [x1, x8, lsl #3]
; CHECK-NEXT: add x8, x8, #1
@@ -608,35 +606,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-BE-NEXT: .LBB5_1: // %loop
; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-BE-NEXT: ldp x10, x9, [x0]
-; CHECK-BE-NEXT: ldrb w16, [x0, #18]
-; CHECK-BE-NEXT: lsr x11, x9, #40
-; CHECK-BE-NEXT: ubfx x12, x9, #33, #7
-; CHECK-BE-NEXT: lsr x15, x10, #45
-; CHECK-BE-NEXT: lsr x13, x10, #40
-; CHECK-BE-NEXT: ubfx x14, x10, #26, #14
-; CHECK-BE-NEXT: orr w11, w12, w11, lsl #7
-; CHECK-BE-NEXT: ldrh w12, [x0, #16]
-; CHECK-BE-NEXT: fmov s0, w15
-; CHECK-BE-NEXT: orr w13, w14, w13, lsl #14
-; CHECK-BE-NEXT: ubfx x14, x9, #14, #18
+; CHECK-BE-NEXT: ldrh w16, [x0, #16]
+; CHECK-BE-NEXT: ldrb w17, [x0, #18]
; CHECK-BE-NEXT: add x0, x0, #32
-; CHECK-BE-NEXT: fmov s1, w11
-; CHECK-BE-NEXT: orr w11, w16, w12, lsl #8
-; CHECK-BE-NEXT: lsl x12, x9, #24
-; CHECK-BE-NEXT: mov v0.s[1], w13
+; CHECK-BE-NEXT: lsl x11, x9, #24
+; CHECK-BE-NEXT: lsr x12, x9, #40
+; CHECK-BE-NEXT: lsr x13, x10, #45
+; CHECK-BE-NEXT: lsl x14, x10, #24
+; CHECK-BE-NEXT: lsr x15, x10, #40
+; CHECK-BE-NEXT: extr x12, x12, x11, #57
+; CHECK-BE-NEXT: fmov s0, w13
; CHECK-BE-NEXT: ubfx x13, x10, #7, #25
+; CHECK-BE-NEXT: extr x14, x15, x14, #50
+; CHECK-BE-NEXT: ubfx x15, x9, #14, #18
; CHECK-BE-NEXT: extr x9, x10, x9, #40
-; CHECK-BE-NEXT: orr w12, w11, w12
-; CHECK-BE-NEXT: mov v1.s[1], w14
-; CHECK-BE-NEXT: lsr w12, w12, #19
+; CHECK-BE-NEXT: fmov s1, w12
+; CHECK-BE-NEXT: orr w12, w17, w16, lsl #8
+; CHECK-BE-NEXT: mov v0.s[1], w14
; CHECK-BE-NEXT: ubfx x9, x9, #12, #20
+; CHECK-BE-NEXT: orr w11, w12, w11
+; CHECK-BE-NEXT: mov v1.s[1], w15
+; CHECK-BE-NEXT: lsr w11, w11, #19
; CHECK-BE-NEXT: mov v0.s[2], w13
-; CHECK-BE-NEXT: mov v1.s[2], w12
+; CHECK-BE-NEXT: mov v1.s[2], w11
; CHECK-BE-NEXT: mov v0.s[3], w9
; CHECK-BE-NEXT: add x9, x1, x8, lsl #3
; CHECK-BE-NEXT: add x8, x8, #1
; CHECK-BE-NEXT: cmp x8, #1000
-; CHECK-BE-NEXT: mov v1.s[3], w11
+; CHECK-BE-NEXT: mov v1.s[3], w12
; CHECK-BE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-BE-NEXT: xtn v0.8b, v0.8h
; CHECK-BE-NEXT: st1 { v0.8b }, [x9]
@@ -650,35 +647,34 @@ define void @trunc_v8i19_to_v8i8_in_loop(ptr %A, ptr %dst) {
; CHECK-DISABLE-NEXT: .LBB5_1: // %loop
; CHECK-DISABLE-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-DISABLE-NEXT: ldp x10, x9, [x0]
-; CHECK-DISABLE-NEXT: ldrb w16, [x0, #18]
-; CHECK-DISABLE-NEXT: lsr x11, x9, #40
-; CHECK-DISABLE-NEXT: ubfx x12, x9, #33, #7
-; CHECK-DISABLE-NEXT: lsr x15, x10, #45
-; CHECK-DISABLE-NEXT: lsr x13, x10, #40
-; CHECK-DISABLE-NEXT: ubfx x14, x10, #26, #14
-; CHECK-DISABLE-NEXT: orr w11, w12, w11, lsl #7
-; CHECK-DISABLE-NEXT: ldrh w12, [x0, #16]
-; CHECK-DISABLE-NEXT: fmov s0, w15
-; CHECK-DISABLE-NEXT: orr w13, w14, w13, lsl #14
-; CHECK-DISABLE-NEXT: ubfx x14, x9, #14, #18
+; CHECK-DISABLE-NEXT: ldrh w16, [x0, #16]
+; CHECK-DISABLE-NEXT: ldrb w17, [x0, #18]
; CHECK-DISABLE-NEXT: add x0, x0, #32
-; CHECK-DISABLE-NEXT: fmov s1, w11
-; CHECK-DISABLE-NEXT: orr w11, w16, w12, lsl #8
-; CHECK-DISABLE-NEXT: lsl x12, x9, #24
-; CHECK-DISABLE-NEXT: mov v0.s[1], w13
+; CHECK-DISABLE-NEXT: lsl x11, x9, #24
+; CHECK-DISABLE-NEXT: lsr x12, x9, #40
+; CHECK-DISABLE-NEXT: lsr x13, x10, #45
+; CHECK-DISABLE-NEXT: lsl x14, x10, #24
+; CHECK-DISABLE-NEXT: lsr x15, x10, #40
+; CHECK-DISABLE-NEXT: extr x12, x12, x11, #57
+; CHECK-DISABLE-NEXT: fmov s0, w13
; CHECK-DISABLE-NEXT: ubfx x13, x10, #7, #25
+; CHECK-DISABLE-NEXT: extr x14, x15, x14, #50
+; CHECK-DISABLE-NEXT: ubfx x15, x9, #14, #18
; CHECK-DISABLE-NEXT: extr x9, x10, x9, #40
-; CHECK-DISABLE-NEXT: orr w12, w11, w12
-; CHECK-DISABLE-NEXT: mov v1.s[1], w14
-; CHECK-DISABLE-NEXT: lsr w12, w12, #19
+; CHECK-DISABLE-NEXT: fmov s1, w12
+; CHECK-DISABLE-NEXT: orr w12, w17, w16, lsl #8
+; CHECK-DISABLE-NEXT: mov v0.s[1], w14
; CHECK-DISABLE-NEXT: ubfx x9, x9, #12, #20
+; CHECK-DISABLE-NEXT: orr w11, w12, w11
+; CHECK-DISABLE-NEXT: mov v1.s[1], w15
+; CHECK-DISABLE-NEXT: lsr w11, w11, #19
; CHECK-DISABLE-NEXT: mov v0.s[2], w13
-; CHECK-DISABLE-NEXT: mov v1.s[2], w12
+; CHECK-DISABLE-NEXT: mov v1.s[2], w11
; CHECK-DISABLE-NEXT: mov v0.s[3], w9
; CHECK-DISABLE-NEXT: add x9, x1, x8, lsl #3
; CHECK-DISABLE-NEXT: add x8, x8, #1
; CHECK-DISABLE-NEXT: cmp x8, #1000
-; CHECK-DISABLE-NEXT: mov v1.s[3], w11
+; CHECK-DISABLE-NEXT: mov v1.s[3], w12
; CHECK-DISABLE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-DISABLE-NEXT: xtn v0.8b, v0.8h
; CHECK-DISABLE-NEXT: st1 { v0.8b }, [x9]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
index ac98dca00be3..e3d31c702482 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap-gfx11.mir
@@ -1,18 +1,28 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1100 %s
-# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s | FileCheck -check-prefix=GFX1150 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx11-generic --amdhsa-code-object-version=6 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1100 %s
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1150 -o - -run-pass=legalizer %s -verify-machineinstrs | FileCheck -check-prefix=GFX1150 %s
---
name: test_trap
body: |
bb.0:
; GFX1100-LABEL: name: test_trap
- ; GFX1100: successors: %bb.2(0x80000000)
+ ; GFX1100: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: .1:
+ ; GFX1100-NEXT: successors:
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: .2:
+ ; GFX1100-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_TRAP 2
; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
@@ -21,18 +31,13 @@ body: |
; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
- ; GFX1100-NEXT: S_BRANCH %bb.2
- ; GFX1100-NEXT: {{ $}}
- ; GFX1100-NEXT: .1:
- ; GFX1100-NEXT: successors:
+ ; GFX1100-NEXT: S_BRANCH %bb.3
; GFX1100-NEXT: {{ $}}
- ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
- ; GFX1100-NEXT: {{ $}}
- ; GFX1100-NEXT: .2:
- ; GFX1100-NEXT: successors: %bb.2(0x80000000)
+ ; GFX1100-NEXT: .3:
+ ; GFX1100-NEXT: successors: %bb.3(0x80000000)
; GFX1100-NEXT: {{ $}}
; GFX1100-NEXT: S_SETHALT 5
- ; GFX1100-NEXT: S_BRANCH %bb.2
+ ; GFX1100-NEXT: S_BRANCH %bb.3
;
; GFX1150-LABEL: name: test_trap
; GFX1150: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
@@ -45,5 +50,63 @@ body: |
G_STORE %0, %1 :: (store 1, addrspace 1)
G_TRAP
G_STORE %0, %1 :: (store 1, addrspace 1)
+...
+
+---
+name: test_fallthrough_trap
+body: |
+ ; GFX1100-LABEL: name: test_fallthrough_trap
+ ; GFX1100: bb.0:
+ ; GFX1100-NEXT: successors: %bb.1(0x80000000), %bb.2(0x00000000)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX1100-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+ ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: S_CBRANCH_EXECNZ %bb.2, implicit $exec
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: bb.1:
+ ; GFX1100-NEXT: successors:
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: bb.2:
+ ; GFX1100-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: S_TRAP 2
+ ; GFX1100-NEXT: [[S_SENDMSG_RTN_B32_:%[0-9]+]]:sreg_32 = S_SENDMSG_RTN_B32 128
+ ; GFX1100-NEXT: $ttmp2 = S_MOV_B32 $m0
+ ; GFX1100-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_SENDMSG_RTN_B32_]], 1023, implicit-def $scc
+ ; GFX1100-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], 1024, implicit-def $scc
+ ; GFX1100-NEXT: $m0 = S_MOV_B32 [[S_OR_B32_]]
+ ; GFX1100-NEXT: S_SENDMSG 1, implicit $exec, implicit $m0
+ ; GFX1100-NEXT: $m0 = S_MOV_B32 $ttmp2
+ ; GFX1100-NEXT: S_BRANCH %bb.3
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: bb.3:
+ ; GFX1100-NEXT: successors: %bb.3(0x80000000)
+ ; GFX1100-NEXT: {{ $}}
+ ; GFX1100-NEXT: S_SETHALT 5
+ ; GFX1100-NEXT: S_BRANCH %bb.3
+ ;
+ ; GFX1150-LABEL: name: test_fallthrough_trap
+ ; GFX1150: bb.0:
+ ; GFX1150-NEXT: successors: %bb.1(0x80000000)
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX1150-NEXT: [[C1:%[0-9]+]]:_(p1) = G_CONSTANT i64 0
+ ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ ; GFX1150-NEXT: S_TRAP 2
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: bb.1:
+ ; GFX1150-NEXT: G_STORE [[C]](s32), [[C1]](p1) :: (store (s8), addrspace 1)
+ bb.0:
+ successors: %bb.1
+
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(p1) = G_CONSTANT i64 0
+ G_STORE %0, %1 :: (store 1, addrspace 1)
+ G_TRAP
+ bb.1:
+ G_STORE %0, %1 :: (store 1, addrspace 1)
...
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
index c105ad7590e6..7932f8d1fc5b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
@@ -302,7 +302,8 @@ define half @test_rootn_f16_neg1(half %x) {
define half @test_rootn_f16_neg2(half %x) {
; CHECK-LABEL: define half @test_rootn_f16_neg2(
; CHECK-SAME: half [[X:%.*]]) {
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call half @_Z5rsqrtDh(half [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call contract half @llvm.sqrt.f16(half [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract half 0xH3C00, [[TMP1]], !fpmath [[META0]]
; CHECK-NEXT: ret half [[__ROOTN2RSQRT]]
;
%call = tail call half @_Z5rootnDhi(half %x, i32 -2)
@@ -371,7 +372,8 @@ define <2 x half> @test_rootn_v2f16_neg1(<2 x half> %x) {
define <2 x half> @test_rootn_v2f16_neg2(<2 x half> %x) {
; CHECK-LABEL: define <2 x half> @test_rootn_v2f16_neg2(
; CHECK-SAME: <2 x half> [[X:%.*]]) {
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call <2 x half> @_Z5rsqrtDv2_Dh(<2 x half> [[X]])
+; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract <2 x half> <half 0xH3C00, half 0xH3C00>, [[TMP1]], !fpmath [[META0]]
; CHECK-NEXT: ret <2 x half> [[__ROOTN2RSQRT]]
;
%call = tail call <2 x half> @_Z5rootnDv2_DhDv2_i(<2 x half> %x, <2 x i32> <i32 -2, i32 -2>)
@@ -865,7 +867,8 @@ define float @test_rootn_f32__y_neg2(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call contract float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract float 1.000000e+00, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -877,7 +880,8 @@ define float @test_rootn_f32__y_neg2__flags(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__flags(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call nnan nsz float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call nnan nsz contract float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv nnan nsz contract float 1.000000e+00, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -889,7 +893,7 @@ define float @test_rootn_f32__y_neg2__strictfp(float %x) #1 {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__strictfp(
; CHECK-SAME: float [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) #[[ATTR0]]
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -901,7 +905,7 @@ define float @test_rootn_f32__y_neg2__noinline(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__noinline(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR3:[0-9]+]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
@@ -913,7 +917,7 @@ define float @test_rootn_f32__y_neg2__nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR4:[0-9]+]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
@@ -925,7 +929,8 @@ define <2 x float> @test_rootn_v2f32__y_neg2(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call <2 x float> @_Z5rsqrtDv2_f(<2 x float> [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv contract <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
@@ -937,7 +942,8 @@ define <2 x float> @test_rootn_v2f32__y_neg2__flags(<2 x float> %x) {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2__flags(
; CHECK-SAME: <2 x float> [[X:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call nnan nsz <2 x float> @_Z5rsqrtDv2_f(<2 x float> [[X]])
+; CHECK-NEXT: [[TMP0:%.*]] = call nnan nsz contract <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = fdiv nnan nsz contract <2 x float> <float 1.000000e+00, float 1.000000e+00>, [[TMP0]], !fpmath [[META0]]
; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
@@ -949,7 +955,7 @@ define <2 x float> @test_rootn_v2f32__y_neg2__strictfp(<2 x float> %x) #1 {
; CHECK-LABEL: define <2 x float> @test_rootn_v2f32__y_neg2__strictfp(
; CHECK-SAME: <2 x float> [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call <2 x float> @_Z5rsqrtDv2_f(<2 x float> [[X]]) #[[ATTR0]]
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call <2 x float> @_Z5rootnDv2_fDv2_i(<2 x float> [[X]], <2 x i32> <i32 -2, i32 -2>) #[[ATTR0]]
; CHECK-NEXT: ret <2 x float> [[__ROOTN2RSQRT]]
;
entry:
@@ -1125,7 +1131,7 @@ define float @test_rootn_fast_f32_nobuiltin(float %x, i32 %y) {
; CHECK-LABEL: define float @test_rootn_fast_f32_nobuiltin(
; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[X]], i32 [[Y]]) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[X]], i32 [[Y]]) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
@@ -1420,7 +1426,7 @@ entry:
define float @test_rootn_f32__y_0_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_0_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 0) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 0) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 0) #0
@@ -1430,7 +1436,7 @@ define float @test_rootn_f32__y_0_nobuiltin(float %x) {
define float @test_rootn_f32__y_1_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_1_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 1) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 1) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 1) #0
@@ -1440,7 +1446,7 @@ define float @test_rootn_f32__y_1_nobuiltin(float %x) {
define float @test_rootn_f32__y_2_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_2_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 2) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 2) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 2) #0
@@ -1450,7 +1456,7 @@ define float @test_rootn_f32__y_2_nobuiltin(float %x) {
define float @test_rootn_f32__y_3_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_3_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 3) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 3) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 3) #0
@@ -1460,7 +1466,7 @@ define float @test_rootn_f32__y_3_nobuiltin(float %x) {
define float @test_rootn_f32__y_neg1_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg1_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -1) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -1) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 -1) #0
@@ -1470,7 +1476,7 @@ define float @test_rootn_f32__y_neg1_nobuiltin(float %x) {
define float @test_rootn_f32__y_neg2_nobuiltin(float %x) {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2_nobuiltin(
; CHECK-SAME: float [[X:%.*]]) {
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR3]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call float @_Z5rootnfi(float [[X]], i32 -2) #[[ATTR4]]
; CHECK-NEXT: ret float [[CALL]]
;
%call = tail call float @_Z5rootnfi(float %x, i32 -2) #0
@@ -1487,7 +1493,8 @@ attributes #2 = { noinline }
; CHECK: attributes #[[ATTR0]] = { strictfp }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) }
-; CHECK: attributes #[[ATTR3]] = { nobuiltin }
+; CHECK: attributes #[[ATTR3]] = { noinline }
+; CHECK: attributes #[[ATTR4]] = { nobuiltin }
;.
; CHECK: [[META0]] = !{float 2.000000e+00}
; CHECK: [[META1]] = !{float 3.000000e+00}
diff --git a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
new file mode 100644
index 000000000000..0c4974f347a8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck %s
+
+; Check that call / asm get an implicit-def $mode added to them in
+; strictfp functions.
+
+declare protected void @maybe_defs_mode() #0
+
+define float @call_changes_mode(float %x, float %y) #0 {
+ ; CHECK-LABEL: name: call_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
+ ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ call void @maybe_defs_mode()
+ %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret float %val
+}
+
+define void @tail_call_changes_mode() #0 {
+ ; CHECK-LABEL: name: tail_call_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
+ ; CHECK-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode
+ tail call void @maybe_defs_mode()
+ ret void
+}
+
+define float @asm_changes_mode(float %x, float %y) #0 {
+ ; CHECK-LABEL: name: asm_changes_mode
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode
+ ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ call void asm sideeffect "; maybe defs mode", ""()
+ %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore")
+ ret float %val
+}
+
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+
+attributes #0 = { strictfp "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
index cfc166ec798f..5162092f78ac 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -47,6 +47,21 @@ define amdgpu_kernel void @dpp_fadd(ptr addrspace(1) %arg) {
ret void
}

+; Fails to combine because v_mul_lo_u32 has no e32 or dpp form.
+; GCN-LABEL: {{^}}dpp_mul:
+; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
+; GCN: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]]
+; GCN: v_mov_b32_dpp [[V2]], [[V2]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+; GCN: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}}
+define amdgpu_kernel void @dpp_mul(ptr addrspace(1) %arg) {
+ %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
+ %load = load i32, ptr addrspace(1) %gep
+ %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1)
+ %mul = mul i32 %tmp0, %load
+ store i32 %mul, ptr addrspace(1) %gep
+ ret void
+}

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6e45084dc4b8..9690e126dfcf 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1,98 +1,3251 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.maximum.f32(float %a, float %b)
- %f1 = call float @llvm.maximum.f32(float %f0, float %c)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Commute operand of second fmaximum
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.maximum.f32(float %a, float %b)
- %f1 = call float @llvm.maximum.f32(float %c, float %f0)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.maximum.f16(half %a, half %b)
- %f1 = call half @llvm.maximum.f16(half %f0, half %c)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fmaximum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.maximum.f16(half %a, half %b)
- %f1 = call half @llvm.maximum.f16(half %c, half %f0)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3
-; since there are no pack instructions for fmaximum3.
-; GCN-LABEL: {{^}}no_fmaximum3_v2f16:
-; GCN: v_pk_maximum_f16 v0, v0, v1
-; GCN: v_pk_maximum_f16 v0, v2, v0
-; GCN: v_pk_maximum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
- %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
- %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max)
- %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d)
- ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fmaximum3_olt_0_f64:
-; GCN-COUNT-2: v_maximum_f64
-define amdgpu_kernel void @no_fmaximum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile double, ptr addrspace(1) %aptr, align 4
- %b = load volatile double, ptr addrspace(1) %bptr, align 4
- %c = load volatile double, ptr addrspace(1) %cptr, align 4
- %f0 = call double @llvm.maximum.f64(double %a, double %b)
- %f1 = call double @llvm.maximum.f64(double %f0, double %c)
- store double %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-declare double @llvm.maximum.f64(double, double)
-declare float @llvm.maximum.f32(float, float)
-declare half @llvm.maximum.f16(half, half)
-declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+define float @v_fmaximum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %c, float %max0)
+ ret float %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_maximum3_f32 v0, s0, s1, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ %cast = bitcast float %max1 to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %readfirstlane
+}
+
+define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %b.fneg = fneg float %b
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b.fneg)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %a.fneg.fabs = fneg float %a.fabs
+ %b.fneg.fabs = fneg float %b.fabs
+ %c.fneg.fabs = fneg float %c.fabs
+ %max0 = call float @llvm.maximum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg.fabs)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg float %b
+ %max0 = call float @llvm.maximum.f32(float %a, float %b.fneg)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_const0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, 0x41000000, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float 8.0, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32__const2(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 8.0)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
+; GFX12-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float 4.0, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
+; GFX12-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float %b)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 4.0)
+ ret float %max1
+}
+
+define float @v_fmaximum3_f32_const1_const2(float %a) {
+; GFX12-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0x41000000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f32_e32 v1, 0x41800000, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.maximum.f32(float %a, float 8.0)
+ %max1 = call float @llvm.maximum.f32(float %max0, float 16.0)
+ ret float %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v4, v0, v2
+; GFX12-NEXT: v_maximum3_f32 v1, v5, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, v4
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v2|, |v4|
+; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, |v6|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, |v1|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, |v0|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v2|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e64 v2, |v4|, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, |v5|, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+ %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+ %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fabs)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v2, -v4
+; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, -v6, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, -v1, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v2, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, -v4
+; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, -v5, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x float> %a
+ %b.fneg = fneg <2 x float> %b
+ %c.fneg = fneg <2 x float> %c
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fneg)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v2
+; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b) {
+; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, 4.0
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>)
+ ret <2 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v6, v0, v3
+; GFX12-NEXT: v_maximum3_f32 v1, v7, v1, v4
+; GFX12-NEXT: v_maximum3_f32 v2, v8, v2, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v6, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v7, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v8, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v8, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %c, <3 x float> %max0)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, v6
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, v7
+; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v3|, |v6|
+; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v4|, |v7|
+; GFX12-NEXT: v_maximum3_f32 v2, |v2|, |v5|, |v8|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v2|, |v5|
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, |v9|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, |v2|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v1|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, |v0|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, |v6|
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v6|, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v6|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v6|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, |v7|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v7|, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v7|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v7|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v2, |v8|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v8|, v2, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v8|, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v8|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
+ %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
+ %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a.fabs, <3 x float> %b.fabs)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c.fabs)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v3, -v6
+; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v4, -v7
+; GFX12-NEXT: v_maximum3_f32 v2, -v2, -v5, -v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v2, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v2, -v5
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, -v9, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, -v2, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v1, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v4
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v0, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v3
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, -v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v6, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v6, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v6, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, -v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v7, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v7, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v7, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v2, -v8
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v8, v2, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v8, 64
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v8, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <3 x float> %a
+ %b.fneg = fneg <3 x float> %b
+ %c.fneg = fneg <3 x float> %c
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a.fneg, <3 x float> %b.fneg)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c.fneg)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v3
+; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v4
+; GFX12-NEXT: v_maximum3_f32 v2, v2, 2.0, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b) {
+; GFX12-LABEL: v_fmaximum3_v3f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, 4.0
+; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, 4.0
+; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>)
+ ret <3 x float> %max1
+}
+
+
+define half @v_fmaximum3_f16(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %c, half %max0)
+ ret half %max1
+}
+
+define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_maximum3_f16 v0, s0, s1, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ %cast = bitcast half %max1 to i16
+ %zext = zext i16 %cast to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readfirstlane
+}
+
+define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b.fabs)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg half %a
+ %b.fneg = fneg half %b
+ %c.fneg = fneg half %c
+ %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b.fneg)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %a.fneg.fabs = fneg half %a.fabs
+ %b.fneg.fabs = fneg half %b.fabs
+ %c.fneg.fabs = fneg half %c.fabs
+ %max0 = call half @llvm.maximum.f16(half %a.fneg.fabs, half %b.fneg.fabs)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg.fabs)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg half %a
+ %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg half %b
+ %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg half %c
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_const0(half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, 0x4800, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v2, 0x4800, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half 8.0, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16__const2(half %a, half %b) {
+; GFX12-LABEL: v_fmaximum3_f16__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 0x4800
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half 8.0)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) {
+; GFX12-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v2, 4.0, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half 4.0, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16__inlineimm(half %a, half %b) {
+; GFX12-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, 4.0, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half %b)
+ %max1 = call half @llvm.maximum.f16(half %max0, half 4.0)
+ ret half %max1
+}
+
+define half @v_fmaximum3_f16_const1_const2(half %a) {
+; GFX12-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_movk_i32 s0, 0x4800
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f16_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_max_f16_e32 v1, 0x4c00, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.maximum.f16(half %a, half 8.0)
+ %max1 = call half @llvm.maximum.f16(half %max0, half 16.0)
+ ret half %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v2, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1
+; GFX9-NEXT: v_pk_max_f16 v3, v3, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2
+; GFX9-NEXT: v_perm_b32 v1, v4, v0, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+ %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
+ %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.fabs, <2 x half> %b.fabs)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c.fabs)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x half> %a
+ %b.fneg = fneg <2 x half> %b
+ %c.fneg = fneg <2 x half> %c
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.fneg, <2 x half> %b.fneg)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c.fneg)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v2f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v3, v0, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
+; GFX12-LABEL: v_fmaximum3_v2f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v2f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v4, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
+ ret <2 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0
+; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v5, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v5
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1
+; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3
+; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2
+; GFX9-NEXT: v_pk_max_f16 v7, v7, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v6, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4
+; GFX9-NEXT: v_perm_b32 v2, v8, v0, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v11
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v6, v9, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_pk_max_f16 v6, v6, v10
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
+ %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
+ %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c)
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a.fabs, <3 x half> %b.fabs)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c.fabs)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <3 x half> %a
+ %b.fneg = fneg <3 x half> %b
+ %c.fneg = fneg <3 x half> %c
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a.fneg, <3 x half> %b.fneg)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c.fneg)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v3f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 2.0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: s_mov_b32 s5, 0x5040100
+; GFX9-NEXT: v_perm_b32 v4, v5, v0, s5
+; GFX9-NEXT: v_pk_max_f16 v4, v4, v2
+; GFX9-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX9-NEXT: v_pack_b32_f16 v7, v1, s4
+; GFX9-NEXT: v_pk_max_f16 v7, v7, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
+; GFX12-LABEL: v_fmaximum3_v3f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v3f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT: v_pk_max_f16 v1, v1, 4.0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
+ ret <3 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0
+; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v5, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v4, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v5
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2
+; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3
+; GFX9-NEXT: v_pk_max_f16 v7, v7, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v6, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5
+; GFX9-NEXT: v_perm_b32 v2, v8, v1, s4
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v11
+; GFX9-NEXT: v_perm_b32 v6, v9, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v6, v6, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
+ %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
+ %c.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %c)
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a.fabs, <4 x half> %b.fabs)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c.fabs)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <4 x half> %a
+ %b.fneg = fneg <4 x half> %b
+ %c.fneg = fneg <4 x half> %c
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a.fneg, <4 x half> %b.fneg)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c.fneg)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
+; GFX12-LABEL: v_fmaximum3_v4f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v4, v8, v1, s4
+; GFX9-NEXT: v_pk_max_f16 v4, v4, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v8, v5, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_pk_max_f16 v8, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v7, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
+; GFX12-LABEL: v_fmaximum3_v4f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_v4f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_max_f16 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_pk_max_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v4, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT: v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>)
+ ret <4 x half> %max1
+}
+
+define double @v_fmaximum3_f64(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[4:5], v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %c, double %max0)
+ ret double %max1
+}
+
+define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, double inreg %c) {
+; GFX12-LABEL: s_fmaximum3_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], s[4:5]
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fmaximum3_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_max_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ %cast = bitcast double %max1 to <2 x i32>
+ %elt0 = extractelement <2 x i32> %cast, i32 0
+ %elt1 = extractelement <2 x i32> %cast, i32 1
+ %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+ %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+ %insert.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+ %insert.1 = insertelement <2 x i32> %insert.0, i32 %readlane1, i32 1
+ ret <2 x i32> %insert.1
+}
+
+define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %max0 = call double @llvm.maximum.f64(double %a, double %b.fabs)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, |v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b.fabs)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], -v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg double %a
+ %b.fneg = fneg double %b
+ %c.fneg = fneg double %c
+ %max0 = call double @llvm.maximum.f64(double %a.fneg, double %b.fneg)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -|v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %a.fneg.fabs = fneg double %a.fabs
+ %b.fneg.fabs = fneg double %b.fabs
+ %c.fneg.fabs = fneg double %c.fabs
+ %max0 = call double @llvm.maximum.f64(double %a.fneg.fabs, double %b.fneg.fabs)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg.fabs)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg double %a
+ %max0 = call double @llvm.maximum.f64(double %a.fneg, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], -v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg double %b
+ %max0 = call double @llvm.maximum.f64(double %a, double %b.fneg)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg double %c
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_const0(double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double 8.0, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64__const2(double %a, double %b) {
+; GFX12-LABEL: v_fmaximum3_f64__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double 8.0)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) {
+; GFX12-LABEL: v_fmaximum3_f64_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], 4.0
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double 4.0, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64__inlineimm(double %a, double %b) {
+; GFX12-LABEL: v_fmaximum3_f64__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], 4.0
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double %b)
+ %max1 = call double @llvm.maximum.f64(double %max0, double 4.0)
+ ret double %max1
+}
+
+define double @v_fmaximum3_f64_const1_const2(double %a) {
+; GFX12-LABEL: v_fmaximum3_f64_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40300000, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fmaximum3_f64_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40300000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.maximum.f64(double %a, double 8.0)
+ %max1 = call double @llvm.maximum.f64(double %max0, double 16.0)
+ ret double %max1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index eef271e69a38..7481fff251d8 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -1,98 +1,3251 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_0_f32:
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: v_minimum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.minimum.f32(float %a, float %b)
- %f1 = call float @llvm.minimum.f32(float %f0, float %c)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; Commute operand of second fminimum
-; GCN-LABEL: {{^}}test_fminimum3_olt_1_f32:
-; GCN: buffer_load_b32 [[REGB:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGA:v[0-9]+]]
-; GCN: buffer_load_b32 [[REGC:v[0-9]+]]
-; GCN: v_minimum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b32 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile float, ptr addrspace(1) %aptr, align 4
- %b = load volatile float, ptr addrspace(1) %bptr, align 4
- %c = load volatile float, ptr addrspace(1) %cptr, align 4
- %f0 = call float @llvm.minimum.f32(float %a, float %b)
- %f1 = call float @llvm.minimum.f32(float %c, float %f0)
- store float %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_0_f16:
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: v_minimum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.minimum.f16(half %a, half %b)
- %f1 = call half @llvm.minimum.f16(half %f0, half %c)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; GCN-LABEL: {{^}}test_fminimum3_olt_1_f16:
-; GCN: buffer_load_u16 [[REGA:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGB:v[0-9]+]]
-; GCN: buffer_load_u16 [[REGC:v[0-9]+]]
-; GCN: v_minimum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]]
-; GCN: buffer_store_b16 [[RESULT]],
-define amdgpu_kernel void @test_fminimum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile half, ptr addrspace(1) %aptr, align 2
- %b = load volatile half, ptr addrspace(1) %bptr, align 2
- %c = load volatile half, ptr addrspace(1) %cptr, align 2
- %f0 = call half @llvm.minimum.f16(half %a, half %b)
- %f1 = call half @llvm.minimum.f16(half %c, half %f0)
- store half %f1, ptr addrspace(1) %out, align 2
- ret void
-}
-
-; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of minimum3
-; since there are no pack instructions for fminimum3.
-; GCN-LABEL: {{^}}no_fminimum3_v2f16:
-; GCN: v_pk_minimum_f16 v0, v0, v1
-; GCN: v_pk_minimum_f16 v0, v2, v0
-; GCN: v_pk_minimum_f16 v0, v0, v3
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @no_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
-entry:
- %min = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
- %min1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %min)
- %res = call <2 x half> @llvm.minimum.v2f16(<2 x half> %min1, <2 x half> %d)
- ret <2 x half> %res
-}
-
-; GCN-LABEL: {{^}}no_fminimum3_olt_0_f64:
-; GCN-COUNT-2: v_minimum_f64
-define amdgpu_kernel void @no_fminimum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) {
- %a = load volatile double, ptr addrspace(1) %aptr, align 4
- %b = load volatile double, ptr addrspace(1) %bptr, align 4
- %c = load volatile double, ptr addrspace(1) %cptr, align 4
- %f0 = call double @llvm.minimum.f64(double %a, double %b)
- %f1 = call double @llvm.minimum.f64(double %f0, double %c)
- store double %f1, ptr addrspace(1) %out, align 4
- ret void
-}
-
-declare double @llvm.minimum.f64(double, double)
-declare float @llvm.minimum.f32(float, float)
-declare half @llvm.minimum.f16(half, half)
-declare <2 x half> @llvm.minimum.v2f16(<2 x half>, <2 x half>)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+
+define float @v_fminimum3_f32(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_commute(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %c, float %max0)
+ ret float %max1
+}
+
+define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inreg %c) {
+; GFX12-LABEL: s_fminimum3_f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_minimum3_f32 v0, s0, s1, v0
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f32_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ %cast = bitcast float %max1 to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %readfirstlane
+}
+
+define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %max0 = call float @llvm.minimum.f32(float %a, float %b.fabs)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b.fabs)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %b.fneg = fneg float %b
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b.fneg)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call float @llvm.fabs.f32(float %a)
+ %b.fabs = call float @llvm.fabs.f32(float %b)
+ %c.fabs = call float @llvm.fabs.f32(float %c)
+ %a.fneg.fabs = fneg float %a.fabs
+ %b.fneg.fabs = fneg float %b.fabs
+ %c.fneg.fabs = fneg float %c.fabs
+ %max0 = call float @llvm.minimum.f32(float %a.fneg.fabs, float %b.fneg.fabs)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg.fabs)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg float %a
+ %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg float %b
+ %max0 = call float @llvm.minimum.f32(float %a, float %b.fneg)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f32_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg float %c
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_const0(float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, 0x41000000, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, 0x41000000, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float 8.0, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32__const2(float %a, float %b) {
+; GFX12-LABEL: v_fminimum3_f32__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 0x41000000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float 8.0)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_inlineimm0(float %b, float %c) {
+; GFX12-LABEL: v_fminimum3_f32_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float 4.0, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float %c)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32__inlineimm(float %a, float %b) {
+; GFX12-LABEL: v_fminimum3_f32__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, 4.0, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float %b)
+ %max1 = call float @llvm.minimum.f32(float %max0, float 4.0)
+ ret float %max1
+}
+
+define float @v_fminimum3_f32_const1_const2(float %a) {
+; GFX12-LABEL: v_fminimum3_f32_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s0, 0x41000000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f32_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f32_e32 v1, 0x41000000, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v0
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call float @llvm.minimum.f32(float %a, float 8.0)
+ %max1 = call float @llvm.minimum.f32(float %max0, float 16.0)
+ ret float %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v4, v0, v2
+; GFX12-NEXT: v_minimum3_f32 v1, v5, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %c, <2 x float> %max0)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, v4
+; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v2|, |v4|
+; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v3|, |v5|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, |v6|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, |v1|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, |v0|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v2|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e64 v2, |v4|, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, |v5|, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+ %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+ %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c.fabs)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v2, -v4
+; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v3, -v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, -v6, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, -v1, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v2, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, -v4
+; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v2, -v5, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x float> %a
+ %b.fneg = fneg <2 x float> %b
+ %c.fneg = fneg <2 x float> %c
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c.fneg)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v2f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v2
+; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
+ ret <2 x float> %max1
+}
+
+define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b) {
+; GFX12-LABEL: v_fminimum3_v2f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, 4.0
+; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f32__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
+ %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>)
+ ret <2 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v6, v0, v3
+; GFX12-NEXT: v_minimum3_f32 v1, v7, v1, v4
+; GFX12-NEXT: v_minimum3_f32 v2, v8, v2, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v6, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v7, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v8, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v8, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %c, <3 x float> %max0)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, v6
+; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, v7
+; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v3|, |v6|
+; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v4|, |v7|
+; GFX12-NEXT: v_minimum3_f32 v2, |v2|, |v5|, |v8|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v2|, |v5|
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, |v9|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, |v2|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v4|
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, |v5|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v1|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, |v4|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, |v0|, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v3|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, |v6|
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v6|, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v6|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v6|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, |v7|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v7|, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v7|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v7|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v2, |v8|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, |v8|, v2, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v8|, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v8|, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
+ %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
+ %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a.fabs, <3 x float> %b.fabs)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c.fabs)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v3, -v6
+; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v4, -v7
+; GFX12-NEXT: v_minimum3_f32 v2, -v2, -v5, -v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v2, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v2, -v5
+; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, -v9, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, -v2, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v1, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v4
+; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, -v5, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v3
+; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, -v4, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v3, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, -v6
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v6, v0, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v6, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v6, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, -v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v7, v1, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v7, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v7, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v2, -v8
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v3, -v8, v2, s[4:5]
+; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v8, 32
+; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v8, s[4:5]
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <3 x float> %a
+ %b.fneg = fneg <3 x float> %b
+ %c.fneg = fneg <3 x float> %c
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a.fneg, <3 x float> %b.fneg)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c.fneg)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c) {
+; GFX12-LABEL: v_fminimum3_v3f32__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v3
+; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v4
+; GFX12-NEXT: v_minimum3_f32 v2, v2, 2.0, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
+ ret <3 x float> %max1
+}
+
+define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b) {
+; GFX12-LABEL: v_fminimum3_v3f32__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, 4.0
+; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, 4.0
+; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f32__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v0, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v1, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v2, vcc
+; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
+ %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>)
+ ret <3 x float> %max1
+}
+
+define half @v_fminimum3_f16(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_commute(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v2, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v2, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %c, half %max0)
+ ret half %max1
+}
+
+define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %c) {
+; GFX12-LABEL: s_fminimum3_f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_minimum3_f16 v0, s0, s1, v0
+; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_min_f16_e32 v1, s0, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, s2, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ %cast = bitcast half %max1 to i16
+ %zext = zext i16 %cast to i32
+ %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
+ ret i32 %readfirstlane
+}
+
+define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, |v0|, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, |v0|, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, |v1|, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, v0, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, |v0|, |v1|, |v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, |v0|, |v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, |v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b.fabs)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, -v0, -v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, -v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg half %a
+ %b.fneg = fneg half %b
+ %c.fneg = fneg half %c
+ %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b.fneg)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, -|v0|, -|v1|, -|v2|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, -|v0|, -|v1|
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, -|v2|
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %b.fabs = call half @llvm.fabs.f16(half %b)
+ %c.fabs = call half @llvm.fabs.f16(half %c)
+ %a.fneg.fabs = fneg half %a.fabs
+ %b.fneg.fabs = fneg half %b.fabs
+ %c.fneg.fabs = fneg half %c.fabs
+ %max0 = call half @llvm.minimum.f16(half %a.fneg.fabs, half %b.fneg.fabs)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg.fabs)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg0(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, -v0, v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, -v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg half %a
+ %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, -v1, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e64 v3, v0, -v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg half %b
+ %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, -v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: v_min_f16_e64 v1, v0, -v2
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg half %c
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_const0(half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, 0x4800, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v2, 0x4800, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half 8.0, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16__const2(half %a, half %b) {
+; GFX12-LABEL: v_fminimum3_f16__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 0x4800
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half 8.0)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_inlineimm0(half %b, half %c) {
+; GFX12-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, 4.0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v2, 4.0, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half 4.0, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half %c)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16__inlineimm(half %a, half %b) {
+; GFX12-LABEL: v_fminimum3_f16__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, 4.0, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half %b)
+ %max1 = call half @llvm.minimum.f16(half %max0, half 4.0)
+ ret half %max1
+}
+
+define half @v_fminimum3_f16_const1_const2(half %a) {
+; GFX12-LABEL: v_fminimum3_f16_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_movk_i32 s0, 0x4800
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f16_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f16_e32 v1, 0x4800, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: v_min_f16_e32 v1, 0x4c00, v0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call half @llvm.minimum.f16(half %a, half 8.0)
+ %max1 = call half @llvm.minimum.f16(half %max0, half 16.0)
+ ret half %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v2, v0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v2, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v3, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v1
+; GFX9-NEXT: v_pk_min_f16 v3, v3, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v2
+; GFX9-NEXT: v_perm_b32 v1, v4, v0, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a)
+ %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b)
+ %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c)
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a.fabs, <2 x half> %b.fabs)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c.fabs)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v3, v0, v1 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v5, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v5, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <2 x half> %a
+ %b.fneg = fneg <2 x half> %b
+ %c.fneg = fneg <2 x half> %c
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a.fneg, <2 x half> %b.fneg)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c.fneg)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v2f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v3, v0, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c)
+ ret <2 x half> %max1
+}
+
+define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
+; GFX12-LABEL: v_fminimum3_v2f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v2f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v0, v4, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b)
+ %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>)
+ ret <2 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0
+; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v5, v1
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v5
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v1
+; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v3
+; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v2
+; GFX9-NEXT: v_pk_min_f16 v7, v7, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v6, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v4
+; GFX9-NEXT: v_perm_b32 v2, v8, v0, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v11
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v6, v9, v1, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_pk_min_f16 v6, v6, v10
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a)
+ %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b)
+ %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c)
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a.fabs, <3 x half> %b.fabs)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c.fabs)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <3 x half> %a
+ %b.fneg = fneg <3 x half> %b
+ %c.fneg = fneg <3 x half> %c
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a.fneg, <3 x half> %b.fneg)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c.fneg)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v3f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 2.0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: s_mov_b32 s5, 0x5040100
+; GFX9-NEXT: v_perm_b32 v4, v5, v0, s5
+; GFX9-NEXT: v_pk_min_f16 v4, v4, v2
+; GFX9-NEXT: s_movk_i32 s4, 0x7e00
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX9-NEXT: v_pack_b32_f16 v7, v1, s4
+; GFX9-NEXT: v_pk_min_f16 v7, v7, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s5
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c)
+ ret <3 x half> %max1
+}
+
+define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
+; GFX12-LABEL: v_fminimum3_v3f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v3f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT: v_pk_min_f16 v1, v1, 4.0
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b)
+ %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>)
+ ret <3 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0
+; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v5, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v5, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v4, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v5
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v8, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16__fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
+; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3
+; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5
+; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16__fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v7, 0x7fff7fff, v0
+; GFX9-NEXT: v_and_b32_e32 v9, 0x7fff7fff, v2
+; GFX9-NEXT: v_and_b32_e32 v6, 0x7fff7fff, v1
+; GFX9-NEXT: v_and_b32_e32 v8, 0x7fff7fff, v3
+; GFX9-NEXT: v_pk_min_f16 v7, v7, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v7
+; GFX9-NEXT: v_mov_b32_e32 v12, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v0|, |v2| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v6, v6, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v12, v9, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, |v1|, |v3| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v2|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v1|, |v3|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v6, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_and_b32_e32 v11, 0x7fff7fff, v5
+; GFX9-NEXT: v_perm_b32 v2, v8, v1, s4
+; GFX9-NEXT: v_and_b32_e32 v10, 0x7fff7fff, v4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v11
+; GFX9-NEXT: v_perm_b32 v6, v9, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v6, v6, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v1, |v5|
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v4|
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v6, vcc
+; GFX9-NEXT: v_perm_b32 v0, v7, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v3, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
+ %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
+ %c.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %c)
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a.fabs, <4 x half> %b.fabs)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c.fabs)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16__fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16__fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v6, v0, v2 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_mov_b32_e32 v7, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v7, v6, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v1, -v3
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v6, -v5
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v8, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v8, -v4
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg <4 x half> %a
+ %b.fneg = fneg <4 x half> %b
+ %c.fneg = fneg <4 x half> %c
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a.fneg, <4 x half> %b.fneg)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c.fneg)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
+; GFX12-LABEL: v_fminimum3_v4f16__inlineimm1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 2.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16__inlineimm1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v6, v8, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v4, v8, v1, s4
+; GFX9-NEXT: v_pk_min_f16 v4, v4, v3
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_perm_b32 v8, v5, v0, s4
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
+; GFX9-NEXT: v_pk_min_f16 v8, v8, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v6, v7, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v8
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT: v_perm_b32 v1, v7, v1, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c)
+ ret <4 x half> %max1
+}
+
+define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
+; GFX12-LABEL: v_fminimum3_v4f16__inlineimm2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 op_sel_hi:[1,0]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_v4f16__inlineimm2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_pk_min_f16 v4, v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX9-NEXT: v_pk_min_f16 v2, v1, v3
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v2, v1, v4, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v2, v0, v6, s4
+; GFX9-NEXT: v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4
+; GFX9-NEXT: v_perm_b32 v1, v1, v3, s4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b)
+ %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>)
+ ret <4 x half> %max1
+}
+
+define double @v_fminimum3_f64(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_commute(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_commute:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[4:5], v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_commute:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %c, double %max0)
+ ret double %max1
+}
+
+define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, double inreg %c) {
+; GFX12-LABEL: s_fminimum3_f64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], s[4:5]
+; GFX12-NEXT: v_readfirstlane_b32 s0, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_readfirstlane_b32 s1, v1
+; GFX12-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fminimum3_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_min_f64 v[2:3], s[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: ; return to shader part epilog
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ %cast = bitcast double %max1 to <2 x i32>
+ %elt0 = extractelement <2 x i32> %cast, i32 0
+ %elt1 = extractelement <2 x i32> %cast, i32 1
+ %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
+ %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
+ %insert.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0
+ %insert.1 = insertelement <2 x i32> %insert.0, i32 %readlane1, i32 1
+ ret <2 x i32> %insert.1
+}
+
+define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %max0 = call double @llvm.minimum.f64(double %a, double %b.fabs)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, |v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b.fabs)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], -v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg double %a
+ %b.fneg = fneg double %b
+ %c.fneg = fneg double %c
+ %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b.fneg)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg_fabs_all:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], -|v[0:1]|, -|v[2:3]|
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -|v[4:5]|
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]|
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]|
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]|
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call double @llvm.fabs.f64(double %a)
+ %b.fabs = call double @llvm.fabs.f64(double %b)
+ %c.fabs = call double @llvm.fabs.f64(double %c)
+ %a.fneg.fabs = fneg double %a.fabs
+ %b.fneg.fabs = fneg double %b.fabs
+ %c.fneg.fabs = fneg double %c.fabs
+ %max0 = call double @llvm.minimum.f64(double %a.fneg.fabs, double %b.fneg.fabs)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg.fabs)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %a.fneg = fneg double %a
+ %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], -v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %b.fneg = fneg double %b
+ %max0 = call double @llvm.minimum.f64(double %a, double %b.fneg)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_fneg2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_fneg2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %c.fneg = fneg double %c
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_const0(double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_const0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_const0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double 8.0, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64__const2(double %a, double %b) {
+; GFX12-LABEL: v_fminimum3_f64__const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64__const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double 8.0)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_inlineimm0(double %b, double %c) {
+; GFX12-LABEL: v_fminimum3_f64_inlineimm0:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_inlineimm0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], 4.0
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double 4.0, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double %c)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64__inlineimm(double %a, double %b) {
+; GFX12-LABEL: v_fminimum3_f64__inlineimm:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64__inlineimm:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_min_f64 v[4:5], v[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], 4.0
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double %b)
+ %max1 = call double @llvm.minimum.f64(double %max0, double 4.0)
+ ret double %max1
+}
+
+define double @v_fminimum3_f64_const1_const2(double %a) {
+; GFX12-LABEL: v_fminimum3_f64_const1_const2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40300000, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fminimum3_f64_const1_const2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x7ff80000
+; GFX9-NEXT: s_mov_b32 s4, 0
+; GFX9-NEXT: s_mov_b32 s5, 0x40300000
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], s[4:5]
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %max0 = call double @llvm.minimum.f64(double %a, double 8.0)
+ %max1 = call double @llvm.minimum.f64(double %max0, double 16.0)
+ ret double %max1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index 64063f65e288..04ef30bd26aa 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -253,25 +253,25 @@ define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) {
; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T0.W, literal.x,
-; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W,
-; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: AND_INT T0.Y, PS, literal.y,
+; EG-NEXT: SUB_INT T0.Z, literal.z, T0.W,
; EG-NEXT: NOT_INT T0.W, PS,
; EG-NEXT: LSHR * T3.W, PV.W, 1,
-; EG-NEXT: -127(nan), 150(2.101948e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: -127(nan), 31(4.344025e-44)
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T1.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT T1.Y, PV.Z, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.W, PV.Z,
+; EG-NEXT: LSHL T0.W, T1.W, PV.Y,
+; EG-NEXT: AND_INT * T1.W, T2.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, PV.X, PV.Y,
+; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W,
; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T0.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T0.W, PS, PV.Y, PV.Z,
+; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y,
; EG-NEXT: ASHR * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.W, PV.W, PS,
@@ -364,79 +364,78 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> %
;
; EG-LABEL: fp_to_sint_v2i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 74, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W,
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z,
-; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44)
+; EG-NEXT: BFE_UINT T0.Z, KC0[3].X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT T0.W, KC0[2].W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T1.Z, KC0[2].W, literal.y,
+; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: ADD_INT T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, PV.Z, literal.x,
; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T0.X, literal.x, PV.W,
-; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: OR_INT T3.W, PV.Z, literal.z,
+; EG-NEXT: AND_INT T0.X, PS, literal.x,
+; EG-NEXT: AND_INT T0.Y, PV.W, literal.x,
+; EG-NEXT: OR_INT T1.Z, T1.Z, literal.y,
+; EG-NEXT: SUB_INT T3.W, literal.z, T0.W,
; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w,
-; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44)
-; EG-NEXT: 8388608(1.175494e-38), 8388607(1.175494e-38)
+; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
+; EG-NEXT: 150(2.101948e-43), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T1.X, PS, literal.x,
-; EG-NEXT: LSHL T1.Y, PV.W, PV.Z,
-; EG-NEXT: AND_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.W, PV.Y,
-; EG-NEXT: AND_INT * T5.W, PV.Y, literal.y,
+; EG-NEXT: AND_INT T1.Y, PV.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.Z, PV.W,
+; EG-NEXT: LSHL T3.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT * T4.W, T1.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44)
-; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0,
-; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X,
-; EG-NEXT: AND_INT * T5.W, T0.X, literal.y,
-; EG-NEXT: -150(nan), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT: LSHL T5.W, PV.X, T0.X,
+; EG-NEXT: AND_INT * T6.W, T2.W, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
-; EG-NEXT: NOT_INT T2.Y, T2.W,
-; EG-NEXT: AND_INT T2.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T2.W, PV.Z,
-; EG-NEXT: LSHR * T4.W, T1.X, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.X, T3.W, 1,
-; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T1.X, PV.Z,
-; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y,
+; EG-NEXT: NOT_INT T1.Y, T1.W,
+; EG-NEXT: SUB_INT T3.Z, literal.x, T0.Z,
+; EG-NEXT: NOT_INT T1.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T2.W, T1.X, 1,
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, T1.Z, 1,
+; EG-NEXT: ADD_INT T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.X, PV.Z,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.y,
; EG-NEXT: -127(nan), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y,
-; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y,
+; EG-NEXT: CNDE_INT T3.Y, T6.W, PV.Z, T5.W, BS:VEC_021/SCL_122
+; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T1.Y,
+; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 23(3.222986e-44), -127(nan)
-; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y,
+; EG-NEXT: CNDE_INT T2.X, T4.W, PV.W, T3.W,
; EG-NEXT: SETGT_INT T1.Y, PS, literal.x,
-; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X,
+; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T0.X,
; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44)
; EG-NEXT: XOR_INT T0.X, PV.W, PS,
-; EG-NEXT: XOR_INT T2.Y, PV.Z, PS,
+; EG-NEXT: XOR_INT T3.Y, PV.Z, PS,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X,
-; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.Z, T0.Y,
; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W,
; EG-NEXT: SUB_INT T1.Y, PV.W, PS,
-; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y,
-; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W,
+; EG-NEXT: SETGT_INT T1.Z, 0.0, T2.Y,
+; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W,
; EG-NEXT: SUB_INT T0.Z, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W,
+; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W,
; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0,
; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0,
; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W,
@@ -567,170 +566,168 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
;
; EG-LABEL: fp_to_sint_v4i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1
+; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 54, @106, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y,
+; EG-NEXT: BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
-; EG-NEXT: OR_INT T0.Z, PS, literal.x,
-; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z,
-; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44)
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.Z, 1,
-; EG-NEXT: -127(nan), 31(4.344025e-44)
+; EG-NEXT: OR_INT T2.W, PS, literal.x,
+; EG-NEXT: ADD_INT * T3.W, PV.W, literal.y,
+; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T1.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201
-; EG-NEXT: LSHL T3.W, T0.Z, PV.Z,
-; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W,
-; EG-NEXT: -127(nan), 32(4.484155e-44)
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS,
-; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W,
-; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z,
+; EG-NEXT: BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W,
+; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: NOT_INT T4.W, PS,
+; EG-NEXT: LSHR * T5.W, PV.W, 1,
+; EG-NEXT: -127(nan), 23(3.222986e-44)
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
+; EG-NEXT: AND_INT T1.Y, T3.W, literal.x,
+; EG-NEXT: LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T3.W, KC0[4].X, literal.y,
+; EG-NEXT: ADD_INT * T4.W, PV.Y, literal.z,
; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38)
+; EG-NEXT: -150(nan), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Y, PS, literal.x,
+; EG-NEXT: OR_INT T1.Z, PV.W, literal.y,
+; EG-NEXT: CNDE_INT T3.W, PV.Y, PV.X, PV.Z,
+; EG-NEXT: SETGT_INT * T5.W, T0.X, literal.z,
+; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W,
-; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x,
-; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0,
-; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS,
-; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x,
-; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T1.W, PV.Z,
-; EG-NEXT: LSHR * T3.W, PV.Y, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z,
-; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y,
-; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y,
-; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W,
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, T0.Z, literal.x,
-; EG-NEXT: AND_INT T4.Y, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y,
-; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T2.X, PV.W, PS,
-; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0,
-; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0,
-; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122
-; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x,
+; EG-NEXT: CNDE_INT T3.Y, PS, 0.0, PV.W,
+; EG-NEXT: SUB_INT T2.Z, literal.x, T1.W,
+; EG-NEXT: LSHL T1.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT * T3.W, T4.W, literal.y,
+; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
+; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x,
+; EG-NEXT: SUB_INT T3.Z, literal.y, T0.Y,
+; EG-NEXT: NOT_INT T4.W, T4.W,
+; EG-NEXT: LSHR * T6.W, T1.Z, 1,
+; EG-NEXT: 32(4.484155e-44), 150(2.101948e-43)
+; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z,
+; EG-NEXT: ADD_INT T0.Y, T0.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z,
+; EG-NEXT: AND_INT * T4.W, PV.Z, literal.y,
+; EG-NEXT: -127(nan), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T4.Y, T3.W, PV.Z, T1.W,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: CNDE_INT T1.W, T1.Y, T0.Z, 0.0,
+; EG-NEXT: CNDE_INT * T2.W, T2.Y, PV.X, 0.0,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W,
-; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y,
-; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z,
-; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z,
-; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: CNDE_INT T2.X, T5.W, PS, PV.W,
+; EG-NEXT: ASHR T1.Y, KC0[3].Z, literal.x,
+; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X,
+; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W,
-; EG-NEXT: XOR_INT T1.Y, PV.W, PS,
+; EG-NEXT: XOR_INT T2.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: OR_INT T0.W, PV.Y, literal.y,
-; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X,
-; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38)
+; EG-NEXT: XOR_INT T1.W, PV.X, PV.Y,
+; EG-NEXT: XOR_INT * T3.W, T3.Y, PV.Y,
+; EG-NEXT: SUB_INT T3.Y, PS, T1.Y,
+; EG-NEXT: SUBB_UINT T1.Z, PV.W, T1.Y,
+; EG-NEXT: SUB_INT T3.W, PV.Z, T2.W,
+; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T4.Y, PV.W, PS,
+; EG-NEXT: SUB_INT T0.Z, PV.Y, PV.Z,
+; EG-NEXT: BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W,
+; EG-NEXT: AND_INT * T4.W, KC0[3].Y, literal.y,
+; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT: ADD_INT T3.Y, PV.W, literal.x,
+; EG-NEXT: OR_INT T1.Z, PS, literal.y,
+; EG-NEXT: BFE_UINT T0.W, KC0[3].W, literal.z, T0.W,
+; EG-NEXT: ADD_INT * T4.W, PV.W, literal.w,
+; EG-NEXT: -127(nan), 8388608(1.175494e-38)
+; EG-NEXT: 23(3.222986e-44), -150(nan)
+; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x,
+; EG-NEXT: ADD_INT T5.Y, PV.W, literal.y,
+; EG-NEXT: SUB_INT T2.Z, literal.z, T3.W,
+; EG-NEXT: NOT_INT T3.W, PS,
+; EG-NEXT: LSHR * T5.W, PV.Z, 1,
+; EG-NEXT: 8388607(1.175494e-38), -150(nan)
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x,
-; EG-NEXT: AND_INT T3.Y, PS, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS,
-; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W,
-; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W,
-; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44)
-; EG-NEXT: SUB_INT T5.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y,
-; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
-; EG-NEXT: OR_INT T1.W, PV.X, literal.x,
-; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y,
-; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: ADD_INT T4.X, T3.X, literal.x,
-; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X,
-; EG-NEXT: AND_INT T2.Z, PS, literal.z,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.W, 1,
-; EG-NEXT: -127(nan), 150(2.101948e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T4.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, PS, PV.W,
+; EG-NEXT: AND_INT T6.Y, PV.Z, literal.x,
+; EG-NEXT: AND_INT T3.Z, PV.Y, literal.y,
+; EG-NEXT: OR_INT T3.W, PV.X, literal.z,
+; EG-NEXT: AND_INT * T5.W, T4.W, literal.y,
+; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
+; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z,
+; EG-NEXT: LSHL T7.Y, T1.Z, PS,
+; EG-NEXT: AND_INT T1.Z, T4.W, literal.x,
+; EG-NEXT: LSHL T4.W, PV.W, PV.Z,
+; EG-NEXT: AND_INT * T5.W, T5.Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T6.X, T1.X, literal.x,
-; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 108:
-; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y,
-; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x,
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T8.Y, PV.Z, PV.Y, 0.0,
+; EG-NEXT: CNDE_INT * T2.Z, T6.Y, PV.X, 0.0,
+; EG-NEXT: ALU clause starting at 106:
+; EG-NEXT: CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122
+; EG-NEXT: SETGT_INT * T7.W, T3.Y, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z,
-; EG-NEXT: AND_INT T2.Z, T6.X, literal.x,
-; EG-NEXT: NOT_INT T1.W, T6.X,
-; EG-NEXT: LSHR * T3.W, T0.W, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x,
-; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T0.W, PV.Z,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
+; EG-NEXT: CNDE_INT T1.X, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T6.Y, PS, T2.Z, T8.Y,
+; EG-NEXT: SUB_INT T1.Z, literal.x, T0.W,
+; EG-NEXT: NOT_INT T6.W, T5.Y,
+; EG-NEXT: LSHR * T7.W, T3.W, 1,
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.X, KC0[3].Y, literal.x,
+; EG-NEXT: ADD_INT T5.Y, T0.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z,
+; EG-NEXT: AND_INT * T3.W, PV.Z, literal.z,
; EG-NEXT: 31(4.344025e-44), -127(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x,
-; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X,
-; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X,
+; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T7.Y, T5.W, PV.Z, T4.W,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, T6.Y, PV.X,
+; EG-NEXT: XOR_INT * T3.W, T1.X, PV.X,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T3.X, PS, T7.X,
-; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X,
-; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X,
-; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x,
+; EG-NEXT: SUB_INT T1.X, PS, T2.X,
+; EG-NEXT: SUBB_UINT T6.Y, PV.W, T2.X,
+; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T3.W, PV.Z, PV.X, T3.X,
+; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: XOR_INT T1.X, PV.W, PS,
-; EG-NEXT: XOR_INT T5.Y, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y,
-; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0,
-; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT: XOR_INT T3.X, PV.W, PS,
+; EG-NEXT: XOR_INT T7.Y, PV.Z, PS,
+; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y,
+; EG-NEXT: SETGT_INT T3.W, 0.0, T3.Y,
+; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Z, 0.0,
+; EG-NEXT: SETGT_INT T1.X, 0.0, T0.Y,
; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W,
-; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W,
-; EG-NEXT: SUB_INT T3.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y,
-; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0,
-; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0,
-; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y,
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT: SUB_INT T1.W, PV.Y, T4.W,
+; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W,
+; EG-NEXT: SUB_INT T4.X, PV.W, PS,
+; EG-NEXT: SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122
+; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0,
+; EG-NEXT: SUB_INT T0.W, T0.W, T2.X,
+; EG-NEXT: CNDE_INT * T1.W, PV.X, T4.Y, 0.0,
+; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.X, 0.0,
+; EG-NEXT: SUB_INT T0.W, T2.Y, T2.W,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0,
-; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Z, T1.X, PV.W, 0.0,
+; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
+; EG-NEXT: CNDE_INT T1.X, T0.Y, PV.W, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR * T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptosi <4 x float> %x to <4 x i64>
store <4 x i64> %conv, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index 5170f9c76db2..5abf82aa1aab 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -200,25 +200,25 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x
; EG-NEXT: ADD_INT * T2.W, PV.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T0.W, literal.x,
-; EG-NEXT: SUB_INT T0.Y, literal.y, T0.W,
-; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: AND_INT T0.Y, PS, literal.y,
+; EG-NEXT: SUB_INT T0.Z, literal.z, T0.W,
; EG-NEXT: NOT_INT T0.W, PS,
; EG-NEXT: LSHR * T3.W, PV.W, 1,
-; EG-NEXT: -127(nan), 150(2.101948e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: -127(nan), 31(4.344025e-44)
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T1.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T0.Z, T2.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
+; EG-NEXT: AND_INT T1.Y, PV.Z, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.W, PV.Z,
+; EG-NEXT: LSHL T0.W, T1.W, PV.Y,
+; EG-NEXT: AND_INT * T1.W, T2.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, PV.X, PV.Y,
+; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT: CNDE_INT T0.W, PS, PV.X, PV.W,
; EG-NEXT: SETGT_INT * T1.W, T0.X, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T0.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T0.W, PS, PV.Y, PV.Z,
+; EG-NEXT: CNDE_INT T1.Z, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T0.W, PS, PV.Z, PV.Y,
; EG-NEXT: ASHR * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.W, PV.W, PS,
@@ -288,79 +288,78 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x
;
; EG-LABEL: fp_to_uint_v2f32_to_v2i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 75, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 74, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT * T1.W, KC0[2].W, literal.x, PV.W,
-; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: BFE_UINT T0.W, KC0[3].X, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T2.W, PV.W, literal.z,
-; EG-NEXT: 8388607(1.175494e-38), 23(3.222986e-44)
+; EG-NEXT: BFE_UINT T0.Z, KC0[3].X, literal.x, PV.W,
+; EG-NEXT: BFE_UINT T0.W, KC0[2].W, literal.x, PV.W,
+; EG-NEXT: AND_INT * T1.Z, KC0[2].W, literal.y,
+; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: ADD_INT T1.W, PV.W, literal.x,
+; EG-NEXT: ADD_INT * T2.W, PV.Z, literal.x,
; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T0.X, literal.x, PV.W,
-; EG-NEXT: SUB_INT T0.Y, literal.x, T1.W,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: OR_INT T3.W, PV.Z, literal.z,
+; EG-NEXT: AND_INT T0.X, PS, literal.x,
+; EG-NEXT: AND_INT T0.Y, PV.W, literal.x,
+; EG-NEXT: OR_INT T1.Z, T1.Z, literal.y,
+; EG-NEXT: SUB_INT T3.W, literal.z, T0.W,
; EG-NEXT: AND_INT * T4.W, KC0[3].X, literal.w,
-; EG-NEXT: 150(2.101948e-43), 31(4.344025e-44)
-; EG-NEXT: 8388608(1.175494e-38), 8388607(1.175494e-38)
+; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
+; EG-NEXT: 150(2.101948e-43), 8388607(1.175494e-38)
; EG-NEXT: OR_INT T1.X, PS, literal.x,
-; EG-NEXT: LSHL T1.Y, PV.W, PV.Z,
-; EG-NEXT: AND_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.W, PV.Y,
-; EG-NEXT: AND_INT * T5.W, PV.Y, literal.y,
+; EG-NEXT: AND_INT T1.Y, PV.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.Z, PV.W,
+; EG-NEXT: LSHL T3.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT * T4.W, T1.W, literal.y,
; EG-NEXT: 8388608(1.175494e-38), 32(4.484155e-44)
-; EG-NEXT: CNDE_INT T2.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, 0.0,
-; EG-NEXT: ADD_INT T1.Z, T0.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T4.W, 0.0, PV.X, T0.X,
-; EG-NEXT: AND_INT * T5.W, T0.X, literal.y,
-; EG-NEXT: -150(nan), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T0.Y, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T2.Z, PV.Y, PV.Z, 0.0,
+; EG-NEXT: LSHL T5.W, PV.X, T0.X,
+; EG-NEXT: AND_INT * T6.W, T2.W, literal.x,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0,
-; EG-NEXT: NOT_INT T2.Y, T2.W,
-; EG-NEXT: AND_INT T2.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T2.W, PV.Z,
-; EG-NEXT: LSHR * T4.W, T1.X, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T3.X, T3.W, 1,
-; EG-NEXT: ADD_INT T3.Y, T0.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T1.X, PV.Z,
-; EG-NEXT: AND_INT * T2.W, T1.Z, literal.y,
+; EG-NEXT: NOT_INT T1.Y, T1.W,
+; EG-NEXT: SUB_INT T3.Z, literal.x, T0.Z,
+; EG-NEXT: NOT_INT T1.W, T2.W, BS:VEC_120/SCL_212
+; EG-NEXT: LSHR * T2.W, T1.X, 1,
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
+; EG-NEXT: LSHR T2.X, T1.Z, 1,
+; EG-NEXT: ADD_INT T2.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.X, PV.Z,
+; EG-NEXT: AND_INT * T2.W, PV.Z, literal.y,
; EG-NEXT: -127(nan), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, PV.X, T2.Y,
-; EG-NEXT: ADD_INT * T1.W, T1.W, literal.y,
+; EG-NEXT: CNDE_INT T3.Y, T6.W, PV.Z, T5.W, BS:VEC_021/SCL_122
+; EG-NEXT: SETGT_INT T0.Z, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, PV.X, T1.Y,
+; EG-NEXT: ADD_INT * T0.W, T0.W, literal.y,
; EG-NEXT: 23(3.222986e-44), -127(nan)
-; EG-NEXT: CNDE_INT T3.X, T0.Z, PV.W, T1.Y,
+; EG-NEXT: CNDE_INT T2.X, T4.W, PV.W, T3.W,
; EG-NEXT: SETGT_INT T1.Y, PS, literal.x,
-; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T0.W, PV.Z, T0.X, PV.X,
+; EG-NEXT: CNDE_INT T1.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T0.X,
; EG-NEXT: ASHR * T2.W, KC0[3].X, literal.y,
; EG-NEXT: 23(3.222986e-44), 31(4.344025e-44)
; EG-NEXT: XOR_INT T0.X, PV.W, PS,
-; EG-NEXT: XOR_INT T2.Y, PV.Z, PS,
+; EG-NEXT: XOR_INT T3.Y, PV.Z, PS,
; EG-NEXT: CNDE_INT T0.Z, PV.Y, 0.0, PV.X,
-; EG-NEXT: CNDE_INT T0.W, PV.Y, T2.X, T0.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Y, T2.Z, T0.Y,
; EG-NEXT: ASHR * T3.W, KC0[2].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: XOR_INT T0.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T1.W, PV.Y, T2.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.X, T2.W,
; EG-NEXT: SUB_INT T1.Y, PV.W, PS,
-; EG-NEXT: SETGT_INT T1.Z, 0.0, T3.Y,
-; EG-NEXT: SUB_INT T0.W, PV.Z, T3.W,
+; EG-NEXT: SETGT_INT T1.Z, 0.0, T2.Y,
+; EG-NEXT: SUB_INT T1.W, PV.Z, T3.W,
; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T3.W,
; EG-NEXT: SUB_INT T0.Z, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.W, 0.0, T1.W,
+; EG-NEXT: SETGT_INT T0.W, 0.0, T0.W,
; EG-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, 0.0,
; EG-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, 0.0,
; EG-NEXT: SUB_INT * T2.W, T0.X, T2.W,
@@ -449,170 +448,168 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i64:
; EG: ; %bb.0:
-; EG-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
-; EG-NEXT: ALU 54, @108, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T2.X, 1
+; EG-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 54, @106, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 6:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.W, KC0[4].X, literal.x, PV.W,
-; EG-NEXT: AND_INT * T2.W, KC0[4].X, literal.y,
+; EG-NEXT: BFE_UINT T1.W, KC0[3].Z, literal.x, PV.W,
+; EG-NEXT: AND_INT * T2.W, KC0[3].Z, literal.y,
; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
-; EG-NEXT: OR_INT T0.Z, PS, literal.x,
-; EG-NEXT: BFE_UINT T2.W, KC0[3].Z, literal.y, T0.W,
-; EG-NEXT: ADD_INT * T3.W, PV.W, literal.z,
-; EG-NEXT: 8388608(1.175494e-38), 23(3.222986e-44)
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T0.Y, PV.W, literal.x,
-; EG-NEXT: AND_INT T1.Z, PS, literal.y,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.Z, 1,
-; EG-NEXT: -127(nan), 31(4.344025e-44)
+; EG-NEXT: OR_INT T2.W, PS, literal.x,
+; EG-NEXT: ADD_INT * T3.W, PV.W, literal.y,
+; EG-NEXT: 8388608(1.175494e-38), -150(nan)
; EG-NEXT: ADD_INT T0.X, T1.W, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T1.Y, 0.0, PS, PV.W,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.y, BS:VEC_201
-; EG-NEXT: LSHL T3.W, T0.Z, PV.Z,
-; EG-NEXT: SUB_INT * T1.W, literal.z, T1.W,
-; EG-NEXT: -127(nan), 32(4.484155e-44)
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T2.Y, 0.0, T0.Z, PS,
-; EG-NEXT: AND_INT T0.Z, KC0[3].Z, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.Y, PV.W,
-; EG-NEXT: SETGT_INT * T4.W, PV.X, literal.z,
+; EG-NEXT: BFE_UINT T0.Y, KC0[4].X, literal.y, T0.W,
+; EG-NEXT: AND_INT T0.Z, PS, literal.z,
+; EG-NEXT: NOT_INT T4.W, PS,
+; EG-NEXT: LSHR * T5.W, PV.W, 1,
+; EG-NEXT: -127(nan), 23(3.222986e-44)
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, PS, PV.W,
+; EG-NEXT: AND_INT T1.Y, T3.W, literal.x,
+; EG-NEXT: LSHL T0.Z, T2.W, PV.Z, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T3.W, KC0[4].X, literal.y,
+; EG-NEXT: ADD_INT * T4.W, PV.Y, literal.z,
; EG-NEXT: 32(4.484155e-44), 8388607(1.175494e-38)
+; EG-NEXT: -150(nan), 0(0.000000e+00)
+; EG-NEXT: AND_INT T2.Y, PS, literal.x,
+; EG-NEXT: OR_INT T1.Z, PV.W, literal.y,
+; EG-NEXT: CNDE_INT T3.W, PV.Y, PV.X, PV.Z,
+; EG-NEXT: SETGT_INT * T5.W, T0.X, literal.z,
+; EG-NEXT: 31(4.344025e-44), 8388608(1.175494e-38)
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T2.X, PS, 0.0, PV.W,
-; EG-NEXT: OR_INT T1.Y, PV.Z, literal.x,
-; EG-NEXT: ADD_INT T0.Z, T2.W, literal.y,
-; EG-NEXT: CNDE_INT T1.W, PV.X, PV.Y, 0.0,
-; EG-NEXT: CNDE_INT * T3.W, T2.Z, T3.W, 0.0,
-; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: CNDE_INT T1.X, T4.W, PV.W, PS,
-; EG-NEXT: ASHR T2.Y, KC0[4].X, literal.x,
-; EG-NEXT: AND_INT T1.Z, PV.Z, literal.x,
-; EG-NEXT: NOT_INT T1.W, PV.Z,
-; EG-NEXT: LSHR * T3.W, PV.Y, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T3.Y, T1.Y, PV.Z,
-; EG-NEXT: XOR_INT T1.Z, PV.X, PV.Y,
-; EG-NEXT: XOR_INT T1.W, T2.X, PV.Y,
-; EG-NEXT: SUB_INT * T2.W, literal.x, T2.W,
-; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T1.X, T0.Z, literal.x,
-; EG-NEXT: AND_INT T4.Y, PS, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, T1.Y, PS, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T1.W, PV.W, T2.Y,
-; EG-NEXT: SUBB_UINT * T2.W, PV.Z, T2.Y,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T2.X, PV.W, PS,
-; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.Z, 0.0,
-; EG-NEXT: CNDE_INT T0.Z, PV.X, T3.Y, 0.0,
-; EG-NEXT: CNDE_INT T1.W, PV.X, T3.X, T3.Y, BS:VEC_021/SCL_122
-; EG-NEXT: SETGT_INT * T2.W, T0.Y, literal.x,
+; EG-NEXT: CNDE_INT T3.Y, PS, 0.0, PV.W,
+; EG-NEXT: SUB_INT T2.Z, literal.x, T1.W,
+; EG-NEXT: LSHL T1.W, PV.Z, PV.Y,
+; EG-NEXT: AND_INT * T3.W, T4.W, literal.y,
+; EG-NEXT: 150(2.101948e-43), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
+; EG-NEXT: AND_INT T2.Y, PV.Z, literal.x,
+; EG-NEXT: SUB_INT T3.Z, literal.y, T0.Y,
+; EG-NEXT: NOT_INT T4.W, T4.W,
+; EG-NEXT: LSHR * T6.W, T1.Z, 1,
+; EG-NEXT: 32(4.484155e-44), 150(2.101948e-43)
+; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, T2.W, T2.Z,
+; EG-NEXT: ADD_INT T0.Y, T0.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T2.W, 0.0, T1.Z, PV.Z,
+; EG-NEXT: AND_INT * T4.W, PV.Z, literal.y,
+; EG-NEXT: -127(nan), 32(4.484155e-44)
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T4.Y, T3.W, PV.Z, T1.W,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: CNDE_INT T1.W, T1.Y, T0.Z, 0.0,
+; EG-NEXT: CNDE_INT * T2.W, T2.Y, PV.X, 0.0,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T1.X, KC0[3].W, literal.x, T0.W,
-; EG-NEXT: AND_INT T3.Y, KC0[3].W, literal.y,
-; EG-NEXT: CNDE_INT T2.Z, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T1.W, PS, PV.Y, PV.Z,
-; EG-NEXT: ASHR * T2.W, KC0[3].Z, literal.z,
-; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: CNDE_INT T2.X, T5.W, PS, PV.W,
+; EG-NEXT: ASHR T1.Y, KC0[3].Z, literal.x,
+; EG-NEXT: CNDE_INT T0.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T1.W, PV.Z, PV.X, T1.X,
+; EG-NEXT: ASHR * T2.W, KC0[4].X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BFE_UINT T3.X, KC0[3].Y, literal.x, T0.W,
-; EG-NEXT: XOR_INT T1.Y, PV.W, PS,
+; EG-NEXT: XOR_INT T2.Y, PV.W, PS,
; EG-NEXT: XOR_INT T0.Z, PV.Z, PS,
-; EG-NEXT: OR_INT T0.W, PV.Y, literal.y,
-; EG-NEXT: SUB_INT * T1.W, literal.z, PV.X,
-; EG-NEXT: 23(3.222986e-44), 8388608(1.175494e-38)
+; EG-NEXT: XOR_INT T1.W, PV.X, PV.Y,
+; EG-NEXT: XOR_INT * T3.W, T3.Y, PV.Y,
+; EG-NEXT: SUB_INT T3.Y, PS, T1.Y,
+; EG-NEXT: SUBB_UINT T1.Z, PV.W, T1.Y,
+; EG-NEXT: SUB_INT T3.W, PV.Z, T2.W,
+; EG-NEXT: SUBB_UINT * T4.W, PV.Y, T2.W,
+; EG-NEXT: SUB_INT T4.Y, PV.W, PS,
+; EG-NEXT: SUB_INT T0.Z, PV.Y, PV.Z,
+; EG-NEXT: BFE_UINT T3.W, KC0[3].Y, literal.x, T0.W,
+; EG-NEXT: AND_INT * T4.W, KC0[3].Y, literal.y,
+; EG-NEXT: 23(3.222986e-44), 8388607(1.175494e-38)
+; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT: ADD_INT T3.Y, PV.W, literal.x,
+; EG-NEXT: OR_INT T1.Z, PS, literal.y,
+; EG-NEXT: BFE_UINT T0.W, KC0[3].W, literal.z, T0.W,
+; EG-NEXT: ADD_INT * T4.W, PV.W, literal.w,
+; EG-NEXT: -127(nan), 8388608(1.175494e-38)
+; EG-NEXT: 23(3.222986e-44), -150(nan)
+; EG-NEXT: AND_INT T1.X, KC0[3].W, literal.x,
+; EG-NEXT: ADD_INT T5.Y, PV.W, literal.y,
+; EG-NEXT: SUB_INT T2.Z, literal.z, T3.W,
+; EG-NEXT: NOT_INT T3.W, PS,
+; EG-NEXT: LSHR * T5.W, PV.Z, 1,
+; EG-NEXT: 8388607(1.175494e-38), -150(nan)
; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
-; EG-NEXT: AND_INT T4.X, KC0[3].Y, literal.x,
-; EG-NEXT: AND_INT T3.Y, PS, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PV.W, PS,
-; EG-NEXT: SUB_INT T1.W, PV.Z, T2.W,
-; EG-NEXT: SUBB_UINT * T3.W, PV.Y, T2.W,
-; EG-NEXT: 8388607(1.175494e-38), 32(4.484155e-44)
-; EG-NEXT: SUB_INT T5.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T0.Y, 0.0, T0.Y,
-; EG-NEXT: CNDE_INT T0.Z, PV.Y, PV.Z, 0.0,
-; EG-NEXT: OR_INT T1.W, PV.X, literal.x,
-; EG-NEXT: ADD_INT * T3.W, T3.X, literal.y,
-; EG-NEXT: 8388608(1.175494e-38), -150(nan)
-; EG-NEXT: ADD_INT T4.X, T3.X, literal.x,
-; EG-NEXT: SUB_INT T3.Y, literal.y, T3.X,
-; EG-NEXT: AND_INT T2.Z, PS, literal.z,
-; EG-NEXT: NOT_INT T4.W, PS,
-; EG-NEXT: LSHR * T5.W, PV.W, 1,
-; EG-NEXT: -127(nan), 150(2.101948e-43)
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.X, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T4.Y, T1.W, PV.Z,
-; EG-NEXT: AND_INT T2.Z, T3.W, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T1.W, 0.0, T1.W, PV.Y, BS:VEC_021/SCL_122
-; EG-NEXT: AND_INT * T3.W, PV.Y, literal.x,
+; EG-NEXT: BIT_ALIGN_INT T2.X, 0.0, PS, PV.W,
+; EG-NEXT: AND_INT T6.Y, PV.Z, literal.x,
+; EG-NEXT: AND_INT T3.Z, PV.Y, literal.y,
+; EG-NEXT: OR_INT T3.W, PV.X, literal.z,
+; EG-NEXT: AND_INT * T5.W, T4.W, literal.y,
+; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
+; EG-NEXT: 8388608(1.175494e-38), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T1.X, 0.0, T1.Z, T2.Z,
+; EG-NEXT: LSHL T7.Y, T1.Z, PS,
+; EG-NEXT: AND_INT T1.Z, T4.W, literal.x,
+; EG-NEXT: LSHL T4.W, PV.W, PV.Z,
+; EG-NEXT: AND_INT * T5.W, T5.Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: ADD_INT T6.X, T1.X, literal.x,
-; EG-NEXT: CNDE_INT T3.Y, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT * T3.Z, PV.Z, PV.Y, 0.0,
-; EG-NEXT: -150(nan), 0(0.000000e+00)
-; EG-NEXT: ALU clause starting at 108:
-; EG-NEXT: CNDE_INT T1.W, T2.Z, T3.X, T4.Y,
-; EG-NEXT: SETGT_INT * T3.W, T4.X, literal.x,
+; EG-NEXT: CNDE_INT T3.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T8.Y, PV.Z, PV.Y, 0.0,
+; EG-NEXT: CNDE_INT * T2.Z, T6.Y, PV.X, 0.0,
+; EG-NEXT: ALU clause starting at 106:
+; EG-NEXT: CNDE_INT T6.W, T1.Z, T2.X, T7.Y, BS:VEC_021/SCL_122
+; EG-NEXT: SETGT_INT * T7.W, T3.Y, literal.x,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T3.X, PS, 0.0, PV.W,
-; EG-NEXT: CNDE_INT T3.Y, PS, T3.Y, T3.Z,
-; EG-NEXT: AND_INT T2.Z, T6.X, literal.x,
-; EG-NEXT: NOT_INT T1.W, T6.X,
-; EG-NEXT: LSHR * T3.W, T0.W, 1,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: ASHR T7.X, KC0[3].Y, literal.x,
-; EG-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; EG-NEXT: BIT_ALIGN_INT T3.Z, 0.0, PS, PV.W,
-; EG-NEXT: LSHL T0.W, T0.W, PV.Z,
-; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
+; EG-NEXT: CNDE_INT T1.X, PS, 0.0, PV.W,
+; EG-NEXT: CNDE_INT T6.Y, PS, T2.Z, T8.Y,
+; EG-NEXT: SUB_INT T1.Z, literal.x, T0.W,
+; EG-NEXT: NOT_INT T6.W, T5.Y,
+; EG-NEXT: LSHR * T7.W, T3.W, 1,
+; EG-NEXT: 150(2.101948e-43), 0(0.000000e+00)
+; EG-NEXT: ASHR T2.X, KC0[3].Y, literal.x,
+; EG-NEXT: ADD_INT T5.Y, T0.W, literal.y,
+; EG-NEXT: BIT_ALIGN_INT T2.Z, 0.0, PS, PV.W,
+; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, T3.W, PV.Z,
+; EG-NEXT: AND_INT * T3.W, PV.Z, literal.z,
; EG-NEXT: 31(4.344025e-44), -127(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T1.X, PS, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T5.Y, PS, PV.Z, PV.W,
-; EG-NEXT: SETGT_INT T2.Z, PV.Y, literal.x,
-; EG-NEXT: XOR_INT T0.W, T3.Y, PV.X,
-; EG-NEXT: XOR_INT * T1.W, T3.X, PV.X,
+; EG-NEXT: CNDE_INT T4.X, PS, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T7.Y, T5.W, PV.Z, T4.W,
+; EG-NEXT: SETGT_INT T1.Z, PV.Y, literal.x,
+; EG-NEXT: XOR_INT T0.W, T6.Y, PV.X,
+; EG-NEXT: XOR_INT * T3.W, T1.X, PV.X,
; EG-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; EG-NEXT: SUB_INT T3.X, PS, T7.X,
-; EG-NEXT: SUBB_UINT T3.Y, PV.W, T7.X,
-; EG-NEXT: CNDE_INT T3.Z, PV.Z, 0.0, PV.Y,
-; EG-NEXT: CNDE_INT T1.W, PV.Z, T0.Z, PV.X,
-; EG-NEXT: ASHR * T3.W, KC0[3].W, literal.x,
+; EG-NEXT: SUB_INT T1.X, PS, T2.X,
+; EG-NEXT: SUBB_UINT T6.Y, PV.W, T2.X,
+; EG-NEXT: CNDE_INT T2.Z, PV.Z, 0.0, PV.Y,
+; EG-NEXT: CNDE_INT T3.W, PV.Z, PV.X, T3.X,
+; EG-NEXT: ASHR * T4.W, KC0[3].W, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: XOR_INT T1.X, PV.W, PS,
-; EG-NEXT: XOR_INT T5.Y, PV.Z, PS,
-; EG-NEXT: SUB_INT T0.Z, PV.X, PV.Y,
-; EG-NEXT: SETGT_INT T1.W, 0.0, T4.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T6.W, T0.Y, T5.X, 0.0,
-; EG-NEXT: SETGT_INT T0.X, 0.0, T0.X,
+; EG-NEXT: XOR_INT T3.X, PV.W, PS,
+; EG-NEXT: XOR_INT T7.Y, PV.Z, PS,
+; EG-NEXT: SUB_INT T1.Z, PV.X, PV.Y,
+; EG-NEXT: SETGT_INT T3.W, 0.0, T3.Y,
+; EG-NEXT: CNDE_INT * T6.W, T0.X, T0.Z, 0.0,
+; EG-NEXT: SETGT_INT T1.X, 0.0, T0.Y,
; EG-NEXT: CNDE_INT T6.Y, PV.W, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.Z, T1.Y, T2.W, BS:VEC_021/SCL_122
-; EG-NEXT: SUB_INT T2.W, PV.Y, T3.W,
-; EG-NEXT: SUBB_UINT * T4.W, PV.X, T3.W,
-; EG-NEXT: SUB_INT T3.X, PV.W, PS,
-; EG-NEXT: SETGT_INT T1.Y, 0.0, T4.Y,
-; EG-NEXT: CNDE_INT T6.Z, T0.Y, PV.Z, 0.0,
-; EG-NEXT: SUB_INT T0.W, T0.W, T7.X, BS:VEC_021/SCL_122
-; EG-NEXT: CNDE_INT * T4.W, PV.X, T2.X, 0.0,
-; EG-NEXT: CNDE_INT T6.X, T1.W, PV.W, 0.0,
-; EG-NEXT: CNDE_INT T4.Y, PV.Y, PV.X, 0.0,
-; EG-NEXT: SUB_INT T0.W, T1.Z, T2.Y,
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: SUB_INT T0.Z, T1.W, T1.Y, BS:VEC_021/SCL_122
+; EG-NEXT: SUB_INT T1.W, PV.Y, T4.W,
+; EG-NEXT: SUBB_UINT * T5.W, PV.X, T4.W,
+; EG-NEXT: SUB_INT T4.X, PV.W, PS,
+; EG-NEXT: SETGT_INT T0.Y, 0.0, T5.Y, BS:VEC_021/SCL_122
+; EG-NEXT: CNDE_INT T6.Z, T0.X, PV.Z, 0.0,
+; EG-NEXT: SUB_INT T0.W, T0.W, T2.X,
+; EG-NEXT: CNDE_INT * T1.W, PV.X, T4.Y, 0.0,
+; EG-NEXT: CNDE_INT T6.X, T3.W, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Y, PV.Y, PV.X, 0.0,
+; EG-NEXT: SUB_INT T0.W, T2.Y, T2.W,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T4.Z, T0.X, PV.W, 0.0,
-; EG-NEXT: SUB_INT * T0.W, T1.X, T3.W, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT T4.X, T1.Y, PV.W, 0.0,
+; EG-NEXT: CNDE_INT T1.Z, T1.X, PV.W, 0.0,
+; EG-NEXT: SUB_INT * T0.W, T3.X, T4.W, BS:VEC_120/SCL_212
+; EG-NEXT: CNDE_INT T1.X, T0.Y, PV.W, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR * T0.X, PV.W, literal.x,
+; EG-NEXT: LSHR * T2.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%conv = fptoui <4 x float> %x to <4 x i64>
store <4 x i64> %conv, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
new file mode 100644
index 000000000000..cdd6e88dd103
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck %s
+
+; CHECK-LABEL: non_kernel_recursion:
+define void @non_kernel_recursion(i32 %val) #2 {
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %ret, label %call
+
+call:
+ %val.sub1 = sub i32 %val, 1
+ call void @non_kernel_recursion(i32 %val.sub1)
+ br label %ret
+
+ret:
+ ret void
+}
+
+; CHECK-LABEL: kernel_caller_recursion:
+; CHECK: .amd_kernel_code_t
+; CHECK: is_dynamic_callstack = 1
+; CHECK: .end_amd_kernel_code_t
+define amdgpu_kernel void @kernel_caller_recursion(i32 %n) #0 {
+ call void @non_kernel_recursion(i32 %n)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
new file mode 100644
index 000000000000..4927c2ffcdf3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+
+define amdgpu_ps void @test(ptr addrspace(1) inreg %ptr) {
+; SDAG-LABEL: test:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s2, src_pops_exiting_wave_id
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, s2
+; SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GFX9-GISEL-LABEL: test:
+; GFX9-GISEL: ; %bb.0:
+; GFX9-GISEL-NEXT: s_mov_b32 s2, src_pops_exiting_wave_id
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-GISEL-NEXT: s_endpgm
+;
+; GFX10-GISEL-LABEL: test:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_mov_b32 s2, src_pops_exiting_wave_id
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-GISEL-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.pops.exiting.wave.id()
+ store i32 %id, ptr addrspace(1) %ptr
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 7a0450761e1f..3a867879bb80 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -228,23 +228,23 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z,
; R600-NEXT: -127(nan), 254(3.559298e-43)
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y,
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T0.Y, T1.X, literal.y,
; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X,
; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.x,
+; R600-NEXT: MUL_IEEE T3.W, PV.Y, literal.x,
+; R600-NEXT: CNDE_INT * T0.W, T0.W, PV.X, T2.W,
; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T1.Z, T1.Y, T3.X, PS,
-; R600-NEXT: CNDE_INT T0.W, T1.W, PV.W, T1.X,
+; R600-NEXT: CNDE_INT T1.Z, T1.W, PS, T1.X,
+; R600-NEXT: CNDE_INT T0.W, T1.Y, T0.Y, PV.W,
; R600-NEXT: LSHL * T1.W, PV.Z, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: ADD_INT T1.W, PS, literal.x,
-; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.Z, PV.W,
; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; R600-NEXT: MUL_IEEE T0.W, PS, PV.W,
; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].Z,
@@ -258,65 +258,63 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
;
; CM-LABEL: s_exp_f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 64, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 62, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[2].Z, -PV.W,
-; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T2.Z, PV.W,
+; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: RNDNE T1.Z, PV.W,
; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W,
-; CM-NEXT: ADD T0.Z, T0.Z, -T2.W,
-; CM-NEXT: FLT_TO_INT * T0.W, PV.Z,
+; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
+; CM-NEXT: TRUNC T1.Z, T1.Z,
+; CM-NEXT: ADD * T0.W, PV.W, PV.Z,
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: FLT_TO_INT T0.Z, T1.Z,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y,
-; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z,
-; CM-NEXT: 2130706432(1.701412e+38), -254(nan)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T1.X, T0.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
+; CM-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z,
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W,
-; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z,
-; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43)
+; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.Y, T2.X, T0.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.X,
-; CM-NEXT: LSHL * T1.W, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Z, T2.Y, T3.X, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.x,
; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; CM-NEXT: ADD_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Z, PV.Y,
+; CM-NEXT: CNDE_INT * T0.W, T3.W, PV.Y, PV.Z,
; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; CM-NEXT: MUL_IEEE T0.Z, PV.W, PV.Z,
; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z,
@@ -610,105 +608,105 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
; R600-NEXT: AND_INT * T0.W, KC0[3].X, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
; R600-NEXT: ADD * T1.W, KC0[3].X, -PV.W,
-; R600-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.y,
-; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.z,
-; R600-NEXT: -4096(nan), 967029397(3.122284e-04)
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T1.Z, PS,
+; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; R600-NEXT: RNDNE T0.Z, PS,
; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W,
-; R600-NEXT: ADD * T2.W, KC0[2].W, -PV.Z,
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x,
-; R600-NEXT: MUL_IEEE T2.Z, T0.Z, literal.y,
+; R600-NEXT: AND_INT * T2.W, KC0[2].W, literal.y,
+; R600-NEXT: 1069064192(1.442383e+00), -4096(nan)
+; R600-NEXT: ADD T1.Z, KC0[2].W, -PS,
; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
; R600-NEXT: ADD * T1.W, T3.W, -PV.Z,
+; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
+; R600-NEXT: ADD T2.Z, PS, PV.W,
+; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x,
+; R600-NEXT: MUL_IEEE * T1.W, T2.W, literal.y,
; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
-; R600-NEXT: ADD T3.Z, PS, PV.W,
-; R600-NEXT: RNDNE T0.W, PV.Z,
-; R600-NEXT: MULADD_IEEE * T1.W, T2.W, literal.x, PV.Y, BS:VEC_021/SCL_122
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: TRUNC T0.Y, T1.Z,
-; R600-NEXT: MULADD_IEEE T0.Z, T0.Z, literal.x, PS, BS:VEC_120/SCL_212
-; R600-NEXT: ADD T1.W, T2.Z, -PV.W, BS:VEC_201
+; R600-NEXT: RNDNE T0.Y, PS,
+; R600-NEXT: MULADD_IEEE T1.Z, T1.Z, literal.x, PV.W,
+; R600-NEXT: TRUNC T0.W, T0.Z, BS:VEC_120/SCL_212
; R600-NEXT: EXP_IEEE * T0.X, PV.Z,
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T0.Z, PV.W, PV.Z,
-; R600-NEXT: FLT_TO_INT T1.W, PV.Y,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.x,
-; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T1.Z, PS, literal.x,
-; R600-NEXT: SETGT_UINT T3.W, PV.W, literal.y,
-; R600-NEXT: EXP_IEEE * T0.Y, PV.Z,
-; R600-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T1.X, PV.W, T2.W, PV.Z,
-; R600-NEXT: MUL_IEEE T1.Y, PS, literal.x,
-; R600-NEXT: MAX_INT T0.Z, T1.W, literal.y,
-; R600-NEXT: MIN_INT T2.W, T1.W, literal.z,
-; R600-NEXT: TRUNC * T0.W, T0.W,
+; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
+; R600-NEXT: FLT_TO_INT T1.Y, PV.W,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.y, PV.Z,
+; R600-NEXT: ADD * T1.W, T1.W, -PV.Y,
+; R600-NEXT: 209715200(1.972152e-31), 967029397(3.122284e-04)
+; R600-NEXT: ADD T1.Z, PS, PV.W,
+; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x,
+; R600-NEXT: SETGT_UINT * T1.W, PV.Y, literal.y,
+; R600-NEXT: 209715200(1.972152e-31), -229(nan)
+; R600-NEXT: CNDE_INT T0.Z, PS, PV.W, T0.Z,
+; R600-NEXT: SETGT_INT T0.W, T1.Y, literal.x,
+; R600-NEXT: EXP_IEEE * T1.X, PV.Z,
+; R600-NEXT: -127(nan), 0(0.000000e+00)
+; R600-NEXT: CNDE_INT T0.Z, PV.W, PV.Z, T0.X,
+; R600-NEXT: MAX_INT T2.W, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, PS, literal.y,
+; R600-NEXT: -330(nan), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y,
+; R600-NEXT: ADD_INT T1.Z, T1.Y, literal.z,
+; R600-NEXT: MIN_INT T2.W, T1.Y, literal.w,
+; R600-NEXT: TRUNC * T4.W, T0.Y,
+; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43)
+; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43)
+; R600-NEXT: FLT_TO_INT T3.X, PS,
+; R600-NEXT: ADD_INT T0.Y, PV.W, literal.x,
+; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T2.W, T1.Y, literal.z,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, PV.Y, PV.Z,
+; R600-NEXT: -254(nan), -127(nan)
+; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T4.X, T1.X, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T0.X, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T1.Z, T0.W, PS, T1.Y,
+; R600-NEXT: CNDE_INT T0.W, PV.W, PV.Z, PV.Y,
+; R600-NEXT: MAX_INT * T1.W, PV.X, literal.y,
; R600-NEXT: 2130706432(1.701412e+38), -330(nan)
-; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T2.X, PS,
-; R600-NEXT: ADD_INT T2.Y, PV.W, literal.x,
-; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T0.W, T1.W, literal.z,
-; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.w,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T3.X, T1.W, literal.x,
-; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W,
-; R600-NEXT: SETGT_INT T0.Z, T1.W, literal.x,
-; R600-NEXT: MUL_IEEE T0.W, T0.X, literal.y,
-; R600-NEXT: MUL_IEEE * T4.W, T0.Y, literal.y,
-; R600-NEXT: -127(nan), 209715200(1.972152e-31)
-; R600-NEXT: MUL_IEEE T4.X, PS, literal.x,
-; R600-NEXT: MUL_IEEE T4.Y, PV.W, literal.x,
-; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, T1.W,
-; R600-NEXT: CNDE_INT T3.W, T3.W, PV.X, T2.Y,
-; R600-NEXT: MAX_INT * T5.W, T2.X, literal.y,
-; R600-NEXT: 209715200(1.972152e-31), -330(nan)
-; R600-NEXT: SETGT_INT T3.X, T1.W, literal.x,
-; R600-NEXT: ADD_INT T2.Y, PS, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T2.X, literal.z,
-; R600-NEXT: SETGT_UINT * T1.W, T2.X, literal.w,
+; R600-NEXT: SETGT_INT T0.X, T1.Y, literal.x,
+; R600-NEXT: ADD_INT T0.Y, PS, literal.y,
+; R600-NEXT: ADD_INT T2.Z, T3.X, literal.z,
+; R600-NEXT: SETGT_UINT * T1.W, T3.X, literal.w,
; R600-NEXT: 127(1.779649e-43), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: MIN_INT * T5.W, T2.X, literal.x,
+; R600-NEXT: MIN_INT * T4.W, T3.X, literal.x,
; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
; R600-NEXT: ADD_INT T5.X, PV.W, literal.x,
-; R600-NEXT: ADD_INT T3.Y, T2.X, literal.y,
-; R600-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
-; R600-NEXT: CNDE_INT T5.W, T1.W, T2.Y, T2.Z,
-; R600-NEXT: SETGT_INT * T6.W, T2.X, literal.y,
+; R600-NEXT: ADD_INT T1.Y, T3.X, literal.y,
+; R600-NEXT: SETGT_UINT T3.Z, T3.X, literal.z,
+; R600-NEXT: CNDE_INT T4.W, T1.W, T0.Y, T2.Z,
+; R600-NEXT: SETGT_INT * T5.W, T3.X, literal.y,
; R600-NEXT: -254(nan), -127(nan)
; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T2.X,
-; R600-NEXT: CNDE_INT T2.Y, PV.Z, PV.Y, PV.X,
-; R600-NEXT: SETGT_INT T2.Z, T2.X, literal.x, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T3.W, T3.X, T1.Z, T3.W, BS:VEC_021/SCL_122
-; R600-NEXT: CNDE_INT * T0.W, T2.W, T4.Y, T0.W,
-; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, T0.Z, PS, T0.X,
-; R600-NEXT: LSHL T3.Y, PV.W, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, PV.Z, PV.X, PV.Y,
-; R600-NEXT: CNDE_INT T0.W, T1.W, T4.X, T4.W,
-; R600-NEXT: MUL_IEEE * T1.W, T1.Y, literal.y,
+; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T3.X,
+; R600-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT: SETGT_INT T2.Z, T3.X, literal.x,
+; R600-NEXT: CNDE_INT T0.W, T0.X, T1.Z, T0.W, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE * T4.W, T2.Y, literal.y,
+; R600-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T3.X, T2.W, T2.Y, PS, BS:VEC_120/SCL_212
+; R600-NEXT: LSHL T1.Y, PV.W, literal.x,
+; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y,
+; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W,
; R600-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T2.X, T3.Z, T1.Y, PS,
-; R600-NEXT: CNDE_INT T0.Y, T6.W, PV.W, T0.Y,
-; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT: CNDE_INT T1.X, T5.W, PS, T1.X, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.Y, T3.Z, T4.X, PV.W, BS:VEC_201
+; R600-NEXT: LSHL T1.Z, PV.Z, literal.x,
; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: CNDE_INT * T1.W, T3.X, PV.X, T1.X,
+; R600-NEXT: CNDE_INT * T1.W, T0.X, T0.Z, PV.X,
; R600-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
; R600-NEXT: MUL_IEEE T1.Y, PS, PV.W,
-; R600-NEXT: SETGT T1.Z, literal.x, KC0[3].X,
+; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].X,
; R600-NEXT: ADD_INT * T0.W, PV.Z, literal.y,
; R600-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00)
; R600-NEXT: ALU clause starting at 101:
-; R600-NEXT: CNDE_INT * T1.W, T2.Z, T0.Y, T2.X,
+; R600-NEXT: CNDE_INT * T1.W, T2.Z, T1.X, T0.Y,
; R600-NEXT: MUL_IEEE T0.Y, PV.W, T0.W,
-; R600-NEXT: SETGT T0.Z, literal.x, KC0[2].W,
-; R600-NEXT: CNDE T0.W, T1.Z, T1.Y, 0.0,
+; R600-NEXT: SETGT T1.Z, literal.x, KC0[2].W,
+; R600-NEXT: CNDE T0.W, T0.Z, T1.Y, 0.0,
; R600-NEXT: SETGT * T1.W, KC0[3].X, literal.y,
; R600-NEXT: -1026650416(-1.032789e+02), 1118925336(8.872284e+01)
; R600-NEXT: CNDE T1.Y, PS, PV.W, literal.x,
@@ -721,118 +719,116 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
;
; CM-LABEL: s_exp_v2f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 100, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 18, @105, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 98, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 18, @103, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[2].W, -PV.W,
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y,
+; CM-NEXT: AND_INT * T2.W, KC0[3].X, literal.z,
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: -4096(nan), 0(0.000000e+00)
+; CM-NEXT: ADD T1.Y, KC0[3].X, -PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Y,
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T0.Y, PV.W,
-; CM-NEXT: AND_INT T2.Z, KC0[3].X, literal.x,
-; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.y, PV.Z,
-; CM-NEXT: -4096(nan), 1069064192(1.442383e+00)
; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
-; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y,
-; CM-NEXT: FLT_TO_INT T1.Z, PV.Y,
-; CM-NEXT: ADD * T0.W, KC0[3].X, -PV.Z,
+; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
-; CM-NEXT: ADD T1.X, T0.Z, -T2.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
-; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y,
-; CM-NEXT: RNDNE * T1.W, PV.Y,
-; CM-NEXT: 967029397(3.122284e-04), -330(nan)
-; CM-NEXT: TRUNC T2.X, PV.W,
-; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.Y,
-; CM-NEXT: ADD * T0.W, PV.X, T0.X,
-; CM-NEXT: 204(2.858649e-43), 1069064192(1.442383e+00)
-; CM-NEXT: EXP_IEEE T0.X, T0.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT: ADD_INT T1.X, T1.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Y, T2.Z, literal.y, T0.Z, BS:VEC_102/SCL_221
-; CM-NEXT: ADD T0.Z, T1.Y, -T1.W,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z,
-; CM-NEXT: 102(1.429324e-43), 967029397(3.122284e-04)
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T3.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y,
-; CM-NEXT: SETGT_UINT T2.Z, T1.Z, literal.z,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
-; CM-NEXT: -229(nan), 2130706432(1.701412e+38)
-; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: TRUNC T1.X, T1.Z,
+; CM-NEXT: RNDNE T2.Y, PV.W,
+; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X, T1.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z,
+; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T0.Z, T1.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
+; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T1.Y, PV.Z, literal.y,
+; CM-NEXT: TRUNC T1.Z, T2.Y,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
; CM-NEXT: EXP_IEEE T0.X (MASKED), T1.W,
; CM-NEXT: EXP_IEEE T0.Y, T1.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: CNDE_INT T4.X, T2.Z, T0.W, T1.Y,
-; CM-NEXT: CNDE_INT T1.Y, T3.X, T2.Y, T1.X,
-; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x,
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T1.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y,
-; CM-NEXT: MUL_IEEE T3.Z, PV.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T2.X, PV.W, T0.W, PV.Z,
+; CM-NEXT: FLT_TO_INT T2.X, T1.Z,
+; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Z, T1.Y, T1.X, T0.W,
+; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 209715200(1.972152e-31), -127(nan)
+; CM-NEXT: CNDE_INT T1.X, PV.W, PV.Z, T0.X,
; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: CNDE_INT T3.Z, PV.X, T1.Y, T1.Z,
-; CM-NEXT: MAX_INT * T0.W, T0.Z, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), -330(nan)
-; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, T0.Z, literal.y,
-; CM-NEXT: SETGT_UINT T4.Z, T0.Z, literal.z,
-; CM-NEXT: MUL_IEEE * T0.W, T0.Y, literal.w,
+; CM-NEXT: SETGT_UINT T1.Z, PV.X, literal.y,
+; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
+; CM-NEXT: -330(nan), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T4.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Z, PV.Y, T2.Y,
+; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T6.X, PV.W, literal.x,
-; CM-NEXT: MIN_INT T4.Y, T0.Z, literal.y,
-; CM-NEXT: CNDE_INT T5.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z,
-; CM-NEXT: MIN_INT T1.Y, T1.Z, literal.x,
-; CM-NEXT: ADD_INT T5.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T3.W, T0.Z, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 381(5.338947e-43), -254(nan)
; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T7.X, T1.W, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T4.Y, T0.Z, literal.x,
-; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T1.W, T1.Z, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT T4.X, PV.W, PV.Z, T0.Y,
+; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.x,
+; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, PV.Y,
+; CM-NEXT: 2130706432(1.701412e+38), -330(nan)
+; CM-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.Z,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.x,
+; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y,
+; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
+; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT: -254(nan), -127(nan)
+; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T5.X, T0.Y, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T2.X,
+; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
+; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T8.X, T2.Z, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T5.X, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T4.Z, T6.X, T0.W, BS:VEC_201
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T5.X, T2.W, PV.W, T0.Y,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.X, T0.Y, T1.Z,
+; CM-NEXT: MUL_IEEE * T1.W, T5.X, literal.y,
+; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T5.X, T3.Z, T5.X, PV.W,
; CM-NEXT: LSHL T0.Y, PV.Z, literal.x,
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T3.Z, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T3.X, T3.Y, T2.Y, BS:VEC_201
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T1.X, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, T0.X, PV.X, BS:VEC_021/SCL_122
+; CM-NEXT: MUL_IEEE * T1.W, T2.Y, literal.y,
+; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T0.W, T2.Y, PV.W,
; CM-NEXT: LSHL T2.Y, PV.Z, literal.x,
; CM-NEXT: ADD_INT * T0.Z, PV.Y, literal.y,
; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
-; CM-NEXT: ALU clause starting at 105:
-; CM-NEXT: CNDE_INT * T0.W, T4.Y, T5.X, T2.X,
-; CM-NEXT: MUL_IEEE T1.X, PV.W, T0.Z,
+; CM-NEXT: ALU clause starting at 103:
+; CM-NEXT: CNDE_INT * T0.W, T2.X, T4.X, T5.X,
+; CM-NEXT: MUL_IEEE T2.X, PV.W, T0.Z,
; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].X,
; CM-NEXT: ADD_INT T0.Z, T2.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T1.Y, T0.X, T4.X, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT * T0.W, T1.Y, T1.X, T0.X, BS:VEC_120/SCL_212
; CM-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00)
; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
; CM-NEXT: SETGT T1.Y, literal.x, KC0[2].W,
@@ -1215,8 +1211,8 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
;
; R600-LABEL: s_exp_v3f32:
; R600: ; %bb.0:
-; R600-NEXT: ALU 100, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 69, @107, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 69, @106, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
@@ -1224,69 +1220,68 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: ALU clause starting at 6:
; R600-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: ADD T1.W, KC0[3].Y, -PV.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE T1.W, PV.W, literal.x,
+; R600-NEXT: ADD * T2.W, KC0[3].Y, -PV.W,
; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T3.W, PS,
-; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x,
+; R600-NEXT: RNDNE * T3.W, PV.W,
+; R600-NEXT: TRUNC T4.W, PV.W,
+; R600-NEXT: MUL_IEEE * T5.W, T2.W, literal.x,
; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS,
-; R600-NEXT: TRUNC * T4.W, PV.W,
+; R600-NEXT: MULADD_IEEE T2.W, T2.W, literal.x, PS,
+; R600-NEXT: FLT_TO_INT * T4.W, PV.W,
; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T0.Z, PS,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
-; R600-NEXT: ADD * T1.W, T2.W, -T3.W,
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T0.W, PS, PV.W,
-; R600-NEXT: MAX_INT * T1.W, PV.Z, literal.x,
-; R600-NEXT: -330(nan), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T0.Y, PS, literal.x,
-; R600-NEXT: ADD_INT T1.Z, T0.Z, literal.y,
-; R600-NEXT: SETGT_UINT T1.W, T0.Z, literal.z,
-; R600-NEXT: EXP_IEEE * T0.X, PV.W,
+; R600-NEXT: MAX_INT T0.Z, PS, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.W,
+; R600-NEXT: ADD * T1.W, T1.W, -T3.W,
+; R600-NEXT: -330(nan), 967029397(3.122284e-04)
+; R600-NEXT: ADD T0.Y, PS, PV.W,
+; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.x,
+; R600-NEXT: ADD_INT T0.W, T4.W, literal.y,
+; R600-NEXT: SETGT_UINT * T1.W, T4.W, literal.z,
; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
; R600-NEXT: -229(nan), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T0.W, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.y,
-; R600-NEXT: -127(nan), 209715200(1.972152e-31)
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
-; R600-NEXT: MIN_INT T3.W, T0.Z, literal.y,
-; R600-NEXT: AND_INT * T4.W, KC0[3].W, literal.z,
-; R600-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
-; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T1.X, T0.X, literal.x,
-; R600-NEXT: ADD T1.Y, KC0[3].W, -PS,
-; R600-NEXT: ADD_INT T2.Z, PV.W, literal.y,
-; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT * T5.W, T0.Z, literal.w,
-; R600-NEXT: 2130706432(1.701412e+38), -254(nan)
+; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
+; R600-NEXT: SETGT_INT T0.W, T4.W, literal.x,
+; R600-NEXT: EXP_IEEE * T0.X, PV.Y,
+; R600-NEXT: -127(nan), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T1.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T0.Y, PV.W, PV.Z, T4.W,
+; R600-NEXT: MIN_INT T0.Z, T4.W, literal.y,
+; R600-NEXT: AND_INT T2.W, KC0[3].W, literal.z,
+; R600-NEXT: MUL_IEEE * T3.W, PS, literal.w,
+; R600-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; R600-NEXT: -4096(nan), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: ADD T1.Y, KC0[3].W, -PV.W,
+; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
+; R600-NEXT: ADD_INT T5.W, T4.W, literal.z,
+; R600-NEXT: SETGT_UINT * T6.W, T4.W, literal.w,
+; R600-NEXT: 209715200(1.972152e-31), -254(nan)
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Z,
-; R600-NEXT: SETGT_INT T2.Y, T0.Z, literal.x,
+; R600-NEXT: CNDE_INT T3.X, PS, PV.W, PV.Z,
+; R600-NEXT: SETGT_INT T2.Y, T4.W, literal.x,
; R600-NEXT: MUL_IEEE T0.Z, PV.Y, literal.y,
-; R600-NEXT: MUL_IEEE T3.W, T4.W, literal.z,
-; R600-NEXT: MUL_IEEE * T6.W, PV.X, literal.w,
+; R600-NEXT: MUL_IEEE * T4.W, T2.W, literal.z, BS:VEC_120/SCL_212
; R600-NEXT: 127(1.779649e-43), 967029397(3.122284e-04)
-; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T1.X, T5.W, T1.X, PS, BS:VEC_120/SCL_212
-; R600-NEXT: RNDNE T3.Y, PV.W,
-; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
-; R600-NEXT: CNDE_INT T5.W, PV.Y, T1.Z, PV.X,
-; R600-NEXT: CNDE_INT * T1.W, T1.W, T0.Y, T2.W,
; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, T0.W, PS, T0.X,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W,
+; R600-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.X, BS:VEC_021/SCL_122
+; R600-NEXT: RNDNE T3.Y, T4.W, BS:VEC_120/SCL_212
+; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, T0.Z,
+; R600-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T3.X, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE * T1.W, T1.X, literal.y,
+; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T1.X, T6.W, T1.X, PS,
; R600-NEXT: LSHL T0.Y, PV.W, literal.x,
; R600-NEXT: AND_INT T1.Z, KC0[3].Z, literal.y,
-; R600-NEXT: MULADD_IEEE T0.W, T4.W, literal.z, PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT: ADD * T1.W, T3.W, -PV.Y,
+; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.z, PV.Z, BS:VEC_120/SCL_212
+; R600-NEXT: ADD * T1.W, T4.W, -PV.Y,
; R600-NEXT: 23(3.222986e-44), -4096(nan)
; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
; R600-NEXT: ADD T1.Y, PS, PV.W,
; R600-NEXT: MUL_IEEE T0.Z, PV.Z, literal.x,
; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: CNDE_INT * T1.W, T2.Y, PV.X, T1.X,
+; R600-NEXT: CNDE_INT * T1.W, T2.Y, T0.X, PV.X,
; R600-NEXT: 1069064192(1.442383e+00), 1065353216(1.000000e+00)
; R600-NEXT: MUL_IEEE T0.X, PS, PV.W,
; R600-NEXT: ADD T0.Y, KC0[3].Z, -T1.Z,
@@ -1300,12 +1295,12 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: MUL_IEEE * T1.W, PS, literal.z,
; R600-NEXT: -1026650416(-1.032789e+02), 967029397(3.122284e-04)
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T1.X, literal.y,
; R600-NEXT: MULADD_IEEE T4.Z, T0.Y, literal.z, PV.W,
; R600-NEXT: FLT_TO_INT T0.W, PV.Z,
; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.w,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; R600-NEXT: 1069064192(1.442383e+00), 381(5.338947e-43)
; R600-NEXT: ADD_INT T4.X, PS, literal.x,
; R600-NEXT: MAX_INT T0.Y, PV.W, literal.y,
@@ -1323,7 +1318,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: 102(1.429324e-43), -229(nan)
; R600-NEXT: ADD_INT * T6.X, T0.W, literal.x,
; R600-NEXT: -127(nan), 0(0.000000e+00)
-; R600-NEXT: ALU clause starting at 107:
+; R600-NEXT: ALU clause starting at 106:
; R600-NEXT: SETGT_UINT T0.Y, T0.W, literal.x,
; R600-NEXT: CNDE_INT T0.Z, T3.W, T0.Z, T2.W, BS:VEC_102/SCL_221
; R600-NEXT: SETGT_INT T2.W, T0.W, literal.y,
@@ -1339,25 +1334,25 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: SETGT_UINT T5.X, T1.Y, literal.x,
; R600-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
; R600-NEXT: MAX_INT T0.Z, T1.Y, literal.y,
-; R600-NEXT: MUL_IEEE T4.W, T1.Z, literal.z,
-; R600-NEXT: MUL_IEEE * T5.W, PV.Y, literal.w,
+; R600-NEXT: MUL_IEEE T4.W, PV.Y, literal.z,
+; R600-NEXT: MUL_IEEE * T5.W, T1.Z, literal.w,
; R600-NEXT: 254(3.559298e-43), -330(nan)
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
-; R600-NEXT: CNDE_INT T6.X, T3.W, PS, T3.Y, BS:VEC_021/SCL_122
-; R600-NEXT: MUL_IEEE T3.Y, PV.W, literal.x,
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
+; R600-NEXT: MUL_IEEE T6.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, T3.W, PV.W, T3.Y, BS:VEC_021/SCL_122
; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
; R600-NEXT: ADD_INT T3.W, T1.Y, literal.z,
-; R600-NEXT: SETGT_UINT * T5.W, T1.Y, literal.w,
+; R600-NEXT: SETGT_UINT * T4.W, T1.Y, literal.w,
; R600-NEXT: 2130706432(1.701412e+38), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), -229(nan)
; R600-NEXT: CNDE_INT T8.X, PS, PV.Z, PV.W,
; R600-NEXT: SETGT_INT T5.Y, T1.Y, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, T0.Y, T4.W, PV.Y, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T2.W, T2.W, PV.X, T1.Z,
+; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, T1.Z,
+; R600-NEXT: CNDE_INT T2.W, T0.Y, T5.W, PV.X, BS:VEC_120/SCL_212
; R600-NEXT: LSHL * T3.W, T4.Y, literal.y,
; R600-NEXT: -127(nan), 23(3.222986e-44)
; R600-NEXT: ADD_INT T6.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.Z, PV.W,
; R600-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T1.Y,
; R600-NEXT: CNDE_INT T0.W, T5.X, T7.X, T4.X,
; R600-NEXT: SETGT_INT * T2.W, T1.Y, literal.y,
@@ -1365,18 +1360,18 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; R600-NEXT: CNDE_INT T4.X, PS, PV.Z, PV.W,
; R600-NEXT: MUL_IEEE T0.Y, PV.Y, PV.X,
; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].Z,
-; R600-NEXT: CNDE_INT T0.W, T5.W, T2.Y, T1.W,
-; R600-NEXT: MUL_IEEE * T1.W, T3.X, literal.y,
+; R600-NEXT: MUL_IEEE T0.W, T2.Y, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T4.W, T3.X, T1.W,
; R600-NEXT: -1026650416(-1.032789e+02), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T3.X, T5.X, T3.X, PS,
-; R600-NEXT: CNDE_INT T1.Y, T5.Y, PV.W, T1.X,
+; R600-NEXT: CNDE_INT T1.X, T5.Y, PS, T1.X,
+; R600-NEXT: CNDE_INT T1.Y, T5.X, T2.Y, PV.W,
; R600-NEXT: CNDE T0.Z, PV.Z, PV.Y, 0.0,
; R600-NEXT: SETGT T0.W, KC0[3].Z, literal.x,
; R600-NEXT: LSHL * T1.W, PV.X, literal.y,
; R600-NEXT: 1118925336(8.872284e+01), 23(3.222986e-44)
-; R600-NEXT: ADD_INT T1.X, PS, literal.x,
+; R600-NEXT: ADD_INT T3.X, PS, literal.x,
; R600-NEXT: CNDE T0.Y, PV.W, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, PV.X,
+; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.X, PV.Y,
; R600-NEXT: CNDE T0.W, T2.X, T0.X, 0.0,
; R600-NEXT: SETGT * T1.W, KC0[3].Y, literal.z,
; R600-NEXT: 1065353216(1.000000e+00), 2139095040(INF)
@@ -1397,197 +1392,193 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
;
; CM-LABEL: s_exp_v3f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 102, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 80, @109, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
+; CM-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 77, @108, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T3.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[3].Y, -PV.W,
-; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T2.Z, PV.W,
+; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: RNDNE T1.Z, PV.W,
; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W,
-; CM-NEXT: ADD T0.Z, T0.Z, -T2.W,
-; CM-NEXT: FLT_TO_INT * T0.W, PV.Z,
+; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
+; CM-NEXT: TRUNC T1.Z, T1.Z,
+; CM-NEXT: ADD * T0.W, PV.W, PV.Z,
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: FLT_TO_INT T0.Z, T1.Z,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y,
-; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z,
-; CM-NEXT: 2130706432(1.701412e+38), -254(nan)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T1.X, T0.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
+; CM-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z,
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W,
-; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z,
-; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43)
+; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T1.X, T2.X, T0.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, T0.X,
+; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Y, T2.Y, T3.X, PV.Z,
; CM-NEXT: LSHL T0.Z, PV.Y, literal.x,
-; CM-NEXT: AND_INT * T1.W, KC0[3].Z, literal.y,
+; CM-NEXT: AND_INT * T0.W, KC0[3].Z, literal.y,
; CM-NEXT: 23(3.222986e-44), -4096(nan)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, literal.x,
; CM-NEXT: ADD T1.Y, KC0[3].Z, -PV.W,
-; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Y, PV.X,
-; CM-NEXT: 1069064192(1.442383e+00), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Y, PV.W, PV.Z,
-; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT: RNDNE * T0.W, PV.X,
-; CM-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT * T1.W, T3.W, PV.X, PV.Y,
+; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
+; CM-NEXT: MUL_IEEE T0.Y, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y,
+; CM-NEXT: AND_INT * T1.W, KC0[3].W, literal.z,
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: -4096(nan), 0(0.000000e+00)
; CM-NEXT: SETGT T1.X, literal.x, KC0[3].Y,
-; CM-NEXT: TRUNC T2.Y, PV.W,
-; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y,
-; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.z, PV.Z,
-; CM-NEXT: -1026650416(-1.032789e+02), -4096(nan)
-; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, PV.W,
-; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y,
-; CM-NEXT: FLT_TO_INT T0.Z, PV.Y,
-; CM-NEXT: ADD * T1.W, KC0[3].W, -PV.Z,
+; CM-NEXT: ADD T2.Y, KC0[3].W, -PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.y, PV.Y,
+; CM-NEXT: -1026650416(-1.032789e+02), 1069064192(1.442383e+00)
+; CM-NEXT: MULADD_IEEE T2.X, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, T1.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
-; CM-NEXT: ADD T0.X, T0.X, -T0.W,
-; CM-NEXT: MUL_IEEE T2.Y, PV.W, literal.x,
-; CM-NEXT: MAX_INT T2.Z, PV.Z, literal.y,
-; CM-NEXT: RNDNE * T0.W, PV.Y,
-; CM-NEXT: 967029397(3.122284e-04), -330(nan)
-; CM-NEXT: TRUNC T3.X, PV.W,
-; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T2.Z, T1.W, literal.y, PV.Y,
-; CM-NEXT: ADD * T1.W, PV.X, T2.X,
-; CM-NEXT: 204(2.858649e-43), 1069064192(1.442383e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T2.Y, T1.Z, literal.y, T2.Z, BS:VEC_102/SCL_221
-; CM-NEXT: ADD T1.Z, T1.Y, -T0.W,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z,
-; CM-NEXT: 102(1.429324e-43), 967029397(3.122284e-04)
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T4.X, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y,
-; CM-NEXT: SETGT_UINT T2.Z, T0.Z, literal.z,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
-; CM-NEXT: -229(nan), 2130706432(1.701412e+38)
-; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: TRUNC T3.X, T1.Z,
+; CM-NEXT: RNDNE T1.Y, PV.W,
+; CM-NEXT: MULADD_IEEE T0.Z, T2.Y, literal.x, PV.Z,
+; CM-NEXT: ADD * T2.W, PV.Y, PV.X,
+; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE T0.Y, T2.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T2.W,
+; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, T0.Z,
+; CM-NEXT: ADD T2.Y, T0.W, -T1.Y, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T0.Z, T3.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
+; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, PV.Z, literal.y,
+; CM-NEXT: TRUNC T1.Z, T1.Y,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
; CM-NEXT: EXP_IEEE T1.X (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T1.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T1.Z, T1.W,
+; CM-NEXT: EXP_IEEE T1.Y, T1.W,
+; CM-NEXT: EXP_IEEE T1.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T1.W (MASKED), T1.W,
-; CM-NEXT: ALU clause starting at 109:
-; CM-NEXT: CNDE_INT T5.X, T2.Z, T0.W, T1.Y,
-; CM-NEXT: CNDE_INT T1.Y, T4.X, T3.Y, T2.X,
-; CM-NEXT: FLT_TO_INT T3.Z, T3.X, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T2.X, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y,
-; CM-NEXT: MUL_IEEE T4.Z, PV.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T3.X, PV.W, T0.W, PV.Z,
-; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: CNDE_INT T4.Z, PV.X, T1.Y, T0.Z,
-; CM-NEXT: MAX_INT * T0.W, T3.Z, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), -330(nan)
-; CM-NEXT: ADD_INT T6.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, T3.Z, literal.y,
-; CM-NEXT: SETGT_UINT T5.Z, T3.Z, literal.z,
-; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.w, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T2.X, T1.Z,
+; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Z, T3.Y, T3.X, T0.W,
+; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 209715200(1.972152e-31), -127(nan)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.Y,
+; CM-NEXT: MUL_IEEE * T4.Y, PV.Y, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 108:
+; CM-NEXT: SETGT_UINT T1.Z, T2.X, literal.x,
+; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: -229(nan), -330(nan)
+; CM-NEXT: ADD_INT T4.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T5.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Z, T4.Y, T2.Y,
+; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T7.X, PV.W, literal.x,
-; CM-NEXT: MIN_INT T4.Y, T3.Z, literal.y,
-; CM-NEXT: CNDE_INT T6.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: SETGT_INT * T2.W, T3.Z, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, T3.Z,
-; CM-NEXT: MIN_INT T1.Y, T0.Z, literal.x,
-; CM-NEXT: ADD_INT T6.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T3.W, T3.Z, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 381(5.338947e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T8.X, T1.W, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T4.Y, T3.Z, literal.x,
-; CM-NEXT: ADD_INT T3.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T1.W, T0.Z, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T1.Y,
+; CM-NEXT: MUL_IEEE T0.Y, T0.Y, literal.x,
+; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: CNDE_INT * T2.W, T3.Y, PV.X, PV.Y, BS:VEC_120/SCL_212
+; CM-NEXT: 2130706432(1.701412e+38), -330(nan)
+; CM-NEXT: CNDE_INT T4.X, T0.W, PV.W, T0.Z,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
+; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T6.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y,
+; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
+; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT: -254(nan), -127(nan)
+; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, T1.W, PV.W, T2.X,
+; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
+; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T9.X, T2.Z, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T5.Z, T7.X, T0.W, BS:VEC_201
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T6.X, T2.W, PV.W, T1.Z,
-; CM-NEXT: LSHL T5.Y, PV.Z, literal.x,
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.Z, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T4.X, T3.Y, T2.Y,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T2.X, PV.W, T0.X,
-; CM-NEXT: LSHL T2.Y, PV.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: SETGT_INT T2.Y, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.Y, T1.Z,
+; CM-NEXT: MUL_IEEE * T1.W, T7.X, literal.y,
+; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T7.X, T3.Z, T7.X, PV.W,
+; CM-NEXT: LSHL T1.Y, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.X, PV.X, BS:VEC_021/SCL_122
+; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.y,
+; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T4.X, T0.W, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Y, PV.Z, literal.x,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T4.Y, PV.X, T3.X, BS:VEC_021/SCL_122
+; CM-NEXT: CNDE_INT * T0.W, T2.X, T5.X, PV.X,
; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
; CM-NEXT: MUL_IEEE T2.X, PV.W, PV.Z,
-; CM-NEXT: SETGT T3.Y, literal.x, KC0[3].W,
+; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].W,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T1.Y, PV.X, T5.X,
+; CM-NEXT: CNDE_INT * T0.W, T2.Y, T3.X, PV.X,
; CM-NEXT: -1026650416(-1.032789e+02), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].Z,
+; CM-NEXT: MUL_IEEE T3.X, PV.W, PV.Z,
+; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].Z,
; CM-NEXT: CNDE T0.Z, PV.Y, PV.X, 0.0,
; CM-NEXT: SETGT * T0.W, KC0[3].W, literal.y,
; CM-NEXT: -1026650416(-1.032789e+02), 1118925336(8.872284e+01)
; CM-NEXT: CNDE T2.X, PV.W, PV.Z, literal.x,
-; CM-NEXT: CNDE T1.Y, PV.Y, PV.X, 0.0,
+; CM-NEXT: CNDE T0.Y, PV.Y, PV.X, 0.0,
; CM-NEXT: SETGT T0.Z, KC0[3].Z, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2139095040(INF), 1118925336(8.872284e+01)
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T0.X, PV.W, literal.x,
-; CM-NEXT: CNDE T1.Y, PV.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE T0.Z, T1.X, T0.Y, 0.0,
+; CM-NEXT: LSHR T3.X, PV.W, literal.x,
+; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, literal.y,
+; CM-NEXT: CNDE T0.Z, T1.X, T0.X, 0.0,
; CM-NEXT: SETGT * T0.W, KC0[3].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 2139095040(INF)
; CM-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00)
-; CM-NEXT: CNDE * T1.X, PV.W, PV.Z, literal.x,
+; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x,
; CM-NEXT: 2139095040(INF), 0(0.000000e+00)
-; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = call <3 x float> @llvm.exp.v3f32(<3 x float> %in)
store <3 x float> %result, ptr addrspace(1) %out
@@ -2050,227 +2041,224 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; R600-LABEL: s_exp_v4f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 98, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 98, @105, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 24, @204, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 95, @105, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 24, @201, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 6:
; R600-NEXT: AND_INT * T0.W, KC0[3].Z, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: ADD T1.W, KC0[3].Z, -PV.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x,
+; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.W,
+; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; R600-NEXT: RNDNE T4.W, PS,
+; R600-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.W, BS:VEC_021/SCL_122
; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T3.W, PS,
-; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PS,
+; R600-NEXT: ADD * T1.W, T3.W, -PV.W,
; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS,
-; R600-NEXT: TRUNC * T4.W, PV.W,
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T0.Z, PS,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
-; R600-NEXT: ADD * T1.W, T2.W, -T3.W,
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T1.Z, PS, PV.W,
-; R600-NEXT: MAX_INT T0.W, PV.Z, literal.x,
-; R600-NEXT: MIN_INT * T1.W, PV.Z, literal.y,
-; R600-NEXT: -330(nan), 381(5.338947e-43)
-; R600-NEXT: ADD_INT T0.X, PS, literal.x,
-; R600-NEXT: ADD_INT T0.Y, PV.W, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT T0.W, T0.Z, literal.w,
-; R600-NEXT: EXP_IEEE * T1.X, PV.Z,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T2.X, T0.Z, literal.x,
-; R600-NEXT: SETGT_UINT T1.Y, T0.Z, literal.y,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T1.W, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z,
-; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
-; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X,
-; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
-; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: AND_INT T2.Y, KC0[4].X, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.y,
-; R600-NEXT: -4096(nan), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T0.X, T1.Y, T3.X, PS,
-; R600-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.X,
-; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
-; R600-NEXT: ADD T0.W, KC0[4].X, -PV.Y,
-; R600-NEXT: MUL_IEEE * T1.W, PV.Y, literal.y,
-; R600-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00)
-; R600-NEXT: RNDNE T1.Y, PS,
-; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; R600-NEXT: ADD_INT T2.W, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT * T3.W, T4.W, PV.Y, PV.X,
-; R600-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00)
-; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W,
-; R600-NEXT: AND_INT T0.Z, KC0[3].W, literal.x,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.Z,
-; R600-NEXT: TRUNC * T2.W, PV.Y,
-; R600-NEXT: -4096(nan), 1069064192(1.442383e+00)
-; R600-NEXT: SETGT T0.X, literal.x, KC0[3].Z,
-; R600-NEXT: FLT_TO_INT T3.Y, PS,
-; R600-NEXT: MULADD_IEEE T1.Z, T2.Y, literal.y, PV.W,
-; R600-NEXT: ADD T0.W, T1.W, -T1.Y,
-; R600-NEXT: MUL_IEEE * T1.W, PV.Z, literal.z,
-; R600-NEXT: -1026650416(-1.032789e+02), 967029397(3.122284e-04)
-; R600-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T1.X, PS,
-; R600-NEXT: AND_INT T1.Y, KC0[3].Y, literal.x,
-; R600-NEXT: ADD T1.Z, PV.W, PV.Z,
-; R600-NEXT: MAX_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.z,
-; R600-NEXT: -4096(nan), -330(nan)
+; R600-NEXT: ADD T0.W, PS, PV.W,
+; R600-NEXT: TRUNC * T1.W, T4.W,
+; R600-NEXT: FLT_TO_INT T1.W, PS,
+; R600-NEXT: EXP_IEEE * T0.X, PV.W,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: MAX_INT T0.W, PV.W, literal.y,
+; R600-NEXT: MIN_INT * T2.W, PV.W, literal.z,
+; R600-NEXT: 209715200(1.972152e-31), -330(nan)
; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T2.X, PS, literal.x,
-; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T3.Y, literal.z,
-; R600-NEXT: SETGT_UINT T0.W, T3.Y, literal.w,
-; R600-NEXT: EXP_IEEE * T1.Z, PV.Z,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T3.X, T3.Y, literal.x,
-; R600-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y,
-; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T2.W, T3.Y, literal.x,
-; R600-NEXT: MUL_IEEE * T3.W, PS, literal.z,
+; R600-NEXT: ADD_INT T1.X, PS, literal.x,
+; R600-NEXT: AND_INT T0.Y, KC0[4].X, literal.y,
+; R600-NEXT: ADD_INT T1.Z, PV.W, literal.z,
+; R600-NEXT: ADD_INT * T0.W, T1.W, literal.w,
+; R600-NEXT: -254(nan), -4096(nan)
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.x,
+; R600-NEXT: -229(nan), 0(0.000000e+00)
+; R600-NEXT: ADD_INT T2.X, T1.W, literal.x,
+; R600-NEXT: SETGT_UINT T1.Y, T1.W, literal.y,
+; R600-NEXT: CNDE_INT T1.Z, PV.W, T1.Z, T0.W,
+; R600-NEXT: SETGT_INT T0.W, T1.W, literal.x,
+; R600-NEXT: ADD * T3.W, KC0[4].X, -T0.Y,
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T4.X, T1.Z, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
-; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Z, T3.Y,
-; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T2.X,
-; R600-NEXT: SETGT_INT * T5.W, T3.Y, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T0.Y, literal.y,
+; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T1.W,
+; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T1.X,
+; R600-NEXT: SETGT_INT * T1.W, T1.W, literal.z,
+; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: ADD T2.X, KC0[3].W, -T0.Z,
-; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT * T2.Z, T0.W, PV.Y, T3.W,
-; R600-NEXT: ALU clause starting at 105:
-; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.x,
-; R600-NEXT: ADD * T3.W, KC0[3].Y, -T1.Y,
+; R600-NEXT: CNDE_INT T1.X, PS, PV.Z, PV.W,
+; R600-NEXT: RNDNE T3.Y, PV.Y,
+; R600-NEXT: MULADD_IEEE T1.Z, T3.W, literal.x, PV.X,
+; R600-NEXT: MUL_IEEE T3.W, T0.Z, literal.y,
+; R600-NEXT: MUL_IEEE * T4.W, T0.X, literal.z,
+; R600-NEXT: 1069064192(1.442383e+00), 209715200(1.972152e-31)
; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T4.Y, T2.W, PV.W, T0.Z,
+; R600-NEXT: MULADD_IEEE T0.Z, T0.Y, literal.y, PV.Z,
+; R600-NEXT: ADD T2.W, T2.Y, -PV.Y, BS:VEC_120/SCL_212
+; R600-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z,
+; R600-NEXT: 2130706432(1.701412e+38), 967029397(3.122284e-04)
+; R600-NEXT: -4096(nan), 0(0.000000e+00)
; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, T1.Y, literal.y,
-; R600-NEXT: CNDE_INT T3.Z, T4.Y, T4.X, PV.W, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T0.W, T2.W, T2.Z, T1.Z,
-; R600-NEXT: LSHL * T2.W, T3.Y, literal.z,
-; R600-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
-; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T4.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T3.Y, T5.W, PV.W, PV.Z,
-; R600-NEXT: RNDNE T1.Z, PV.Y,
-; R600-NEXT: MULADD_IEEE T0.W, T3.W, literal.y, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT: MUL_IEEE * T2.W, T2.X, literal.z,
+; R600-NEXT: ADD T0.Y, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT T0.Z, T0.W, PV.Y, T0.X, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.W, T1.Y, T4.W, PV.X,
+; R600-NEXT: LSHL * T2.W, T1.X, literal.y,
+; R600-NEXT: 1069064192(1.442383e+00), 23(3.222986e-44)
+; R600-NEXT: AND_INT T0.X, KC0[3].W, literal.x,
+; R600-NEXT: TRUNC T1.Y, T3.Y,
+; R600-NEXT: ADD_INT T1.Z, PS, literal.y,
+; R600-NEXT: CNDE_INT T0.W, T1.W, PV.Z, PV.W,
+; R600-NEXT: EXP_IEEE * T0.Y, PV.Y,
+; R600-NEXT: -4096(nan), 1065353216(1.000000e+00)
+; R600-NEXT: MUL_IEEE T1.X, PV.W, PV.Z,
+; R600-NEXT: FLT_TO_INT T1.Y, PV.Y,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: ADD T0.W, KC0[3].W, -PV.X,
+; R600-NEXT: RNDNE * T1.W, T3.X,
+; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; R600-NEXT: SETGT T2.X, literal.x, KC0[3].Z,
+; R600-NEXT: TRUNC T2.Y, PS,
+; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.y,
+; R600-NEXT: MUL_IEEE T2.W, PV.Z, literal.z,
+; R600-NEXT: MAX_INT * T4.W, PV.Y, literal.w,
+; R600-NEXT: -1026650416(-1.032789e+02), 967029397(3.122284e-04)
+; R600-NEXT: 209715200(1.972152e-31), -330(nan)
+; R600-NEXT: ADD T4.X, KC0[3].Y, -T3.W,
+; R600-NEXT: ADD_INT T3.Y, PS, literal.x,
+; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T4.W, T1.Y, literal.z,
+; R600-NEXT: MIN_INT * T5.W, T1.Y, literal.w,
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: -229(nan), 381(5.338947e-43)
+; R600-NEXT: ADD_INT T5.X, PS, literal.x,
+; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z,
+; R600-NEXT: CNDE_INT T5.W, PV.W, PV.Y, PV.Z,
+; R600-NEXT: SETGT_INT * T6.W, T1.Y, literal.y,
+; R600-NEXT: -254(nan), -127(nan)
+; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T6.X, T0.Y, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, PS, PV.W, T1.Y,
+; R600-NEXT: CNDE_INT * T2.Z, PV.Z, PV.Y, PV.X,
+; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: ALU clause starting at 105:
+; R600-NEXT: SETGT_INT T5.W, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE * T7.W, T4.X, literal.y,
+; R600-NEXT: 127(1.779649e-43), 967029397(3.122284e-04)
+; R600-NEXT: MUL_IEEE T5.X, T0.X, literal.x,
+; R600-NEXT: MULADD_IEEE T1.Y, T4.X, literal.x, PS, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T2.Z, PV.W, T3.Y, T2.Z,
+; R600-NEXT: MUL_IEEE T7.W, T6.X, literal.y, BS:VEC_201
+; R600-NEXT: CNDE_INT * T2.W, T4.W, T2.W, T0.Z,
+; R600-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T4.X, T6.W, PS, T0.Y,
+; R600-NEXT: CNDE_INT T0.Y, T3.Z, T6.X, PV.W,
+; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT: MULADD_IEEE T2.W, T3.W, literal.y, PV.Y, BS:VEC_201
+; R600-NEXT: ADD * T1.W, T3.X, -T1.W,
+; R600-NEXT: 23(3.222986e-44), 967029397(3.122284e-04)
+; R600-NEXT: ADD T3.X, PS, PV.W,
+; R600-NEXT: ADD_INT T1.Y, PV.Z, literal.x,
+; R600-NEXT: CNDE_INT T0.Z, T5.W, PV.X, PV.Y,
+; R600-NEXT: RNDNE T1.W, T5.X,
+; R600-NEXT: MULADD_IEEE * T0.W, T0.W, literal.y, T1.Z, BS:VEC_021/SCL_122
; R600-NEXT: 1065353216(1.000000e+00), 1069064192(1.442383e+00)
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T2.X, T2.X, literal.x, PS,
-; R600-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.y, PV.W,
-; R600-NEXT: ADD T2.Z, T2.Y, -PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT: MUL_IEEE T0.W, PV.Y, PV.X,
-; R600-NEXT: SETGT * T2.W, literal.z, KC0[4].X,
-; R600-NEXT: 1069064192(1.442383e+00), 967029397(3.122284e-04)
-; R600-NEXT: -1026650416(-1.032789e+02), 0(0.000000e+00)
-; R600-NEXT: CNDE T3.X, PS, PV.W, 0.0,
-; R600-NEXT: ADD T1.Y, PV.Z, PV.Y,
-; R600-NEXT: TRUNC T1.Z, T1.Z,
-; R600-NEXT: MULADD_IEEE T0.W, T0.Z, literal.x, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT: ADD * T1.W, T1.W, -T1.X,
-; R600-NEXT: 967029397(3.122284e-04), 0(0.000000e+00)
-; R600-NEXT: SETGT T2.X, KC0[4].X, literal.x,
-; R600-NEXT: ADD T2.Y, PS, PV.W,
-; R600-NEXT: FLT_TO_INT T0.Z, PV.Z,
-; R600-NEXT: TRUNC T0.W, T1.X,
-; R600-NEXT: EXP_IEEE * T1.X, PV.Y,
-; R600-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T4.X, PS, literal.x,
-; R600-NEXT: FLT_TO_INT T1.Y, PV.W,
-; R600-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: MUL_IEEE T0.W, PS, literal.z,
-; R600-NEXT: EXP_IEEE * T1.W, PV.Y,
-; R600-NEXT: 2130706432(1.701412e+38), -330(nan)
+; R600-NEXT: MULADD_IEEE T0.X, T0.X, literal.x, PS,
+; R600-NEXT: ADD T0.Y, T5.X, -PV.W, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE T0.Z, PV.Z, PV.Y,
+; R600-NEXT: SETGT T0.W, literal.y, KC0[4].X,
+; R600-NEXT: EXP_IEEE * T1.Y, PV.X,
+; R600-NEXT: 967029397(3.122284e-04), -1026650416(-1.032789e+02)
+; R600-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0,
+; R600-NEXT: ADD T0.Y, PV.Y, PV.X,
+; R600-NEXT: FLT_TO_INT T0.Z, T2.Y,
+; R600-NEXT: TRUNC T0.W, T1.W,
+; R600-NEXT: MUL_IEEE * T1.W, PS, literal.x,
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T5.X, PV.W, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.x,
-; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T2.W, T0.Z, literal.z,
-; R600-NEXT: MAX_INT * T3.W, PV.Y, literal.w,
-; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -330(nan)
-; R600-NEXT: SETGT_UINT T6.X, T0.Z, literal.x,
-; R600-NEXT: ADD_INT T3.Y, PS, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.z,
-; R600-NEXT: SETGT_UINT T3.W, T1.Y, literal.x,
-; R600-NEXT: MIN_INT * T4.W, T1.Y, literal.w,
+; R600-NEXT: SETGT T0.X, KC0[4].X, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT: FLT_TO_INT T1.Z, PV.W,
+; R600-NEXT: MAX_INT T0.W, PV.Z, literal.z,
+; R600-NEXT: EXP_IEEE * T0.Y, PV.Y,
+; R600-NEXT: 1118925336(8.872284e+01), 209715200(1.972152e-31)
+; R600-NEXT: -330(nan), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T4.X, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE T3.Y, PS, literal.y,
+; R600-NEXT: ADD_INT T2.Z, PV.W, literal.z,
+; R600-NEXT: ADD_INT * T0.W, T0.Z, literal.w,
+; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: MAX_INT * T2.W, T1.Z, literal.x,
+; R600-NEXT: -330(nan), 0(0.000000e+00)
+; R600-NEXT: SETGT_UINT T5.X, T0.Z, literal.x,
+; R600-NEXT: ADD_INT T4.Y, PV.W, literal.y,
+; R600-NEXT: ADD_INT T3.Z, T1.Z, literal.z, BS:VEC_120/SCL_212
+; R600-NEXT: SETGT_UINT T2.W, T1.Z, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: MIN_INT * T3.W, T1.Z, literal.w,
; R600-NEXT: -229(nan), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43)
-; R600-NEXT: ADD_INT T7.X, PS, literal.x,
-; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y,
-; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z,
-; R600-NEXT: CNDE_INT T4.W, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT * T5.W, T1.Y, literal.y,
+; R600-NEXT: ADD_INT T6.X, PS, literal.x,
+; R600-NEXT: ADD_INT T5.Y, T1.Z, literal.y,
+; R600-NEXT: SETGT_UINT T4.Z, T1.Z, literal.z,
+; R600-NEXT: CNDE_INT T3.W, PV.W, PV.Y, PV.Z,
+; R600-NEXT: SETGT_INT * T4.W, T1.Z, literal.y,
; R600-NEXT: -254(nan), -127(nan)
; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T8.X, PS, PV.W, T1.Y,
-; R600-NEXT: CNDE_INT T3.Y, PV.Z, PV.Y, PV.X,
-; R600-NEXT: SETGT_INT T2.Z, T1.Y, literal.x,
-; R600-NEXT: CNDE_INT T2.W, T6.X, T1.Z, T2.W,
-; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.y,
+; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T1.Z, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT: SETGT_INT T1.Z, T1.Z, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T0.W, T5.X, T2.Z, T0.W, BS:VEC_102/SCL_221
+; R600-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
; R600-NEXT: 127(1.779649e-43), -127(nan)
-; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T0.Z,
-; R600-NEXT: CNDE_INT T1.Y, PV.Z, PV.X, PV.Y,
-; R600-NEXT: MIN_INT T1.Z, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE T2.W, T1.W, literal.y,
-; R600-NEXT: MUL_IEEE * T6.W, T2.Y, literal.z,
-; R600-NEXT: 381(5.338947e-43), 2130706432(1.701412e+38)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T8.X, T3.W, PS, T2.Y,
-; R600-NEXT: MUL_IEEE T2.Y, PV.W, literal.x,
-; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT * T6.W, T0.Z, literal.w,
+; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T0.Z,
+; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.X, PV.Y,
+; R600-NEXT: MIN_INT T2.Z, T0.Z, literal.x,
+; R600-NEXT: MUL_IEEE T0.W, T3.Y, literal.y,
+; R600-NEXT: MUL_IEEE * T5.W, T0.Y, literal.z,
+; R600-NEXT: 381(5.338947e-43), 209715200(1.972152e-31)
+; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T7.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, T2.W, PV.W, T3.Y,
+; R600-NEXT: ADD_INT T2.Z, PV.Z, literal.y,
+; R600-NEXT: ADD_INT T0.W, T0.Z, literal.z,
+; R600-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; R600-NEXT: 2130706432(1.701412e+38), -254(nan)
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T9.X, PS, PV.W, PV.Z,
-; R600-NEXT: SETGT_INT T3.Y, T0.Z, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, T3.Z, T2.W, PV.Y, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T1.W, T5.W, PV.X, T1.W, BS:VEC_021/SCL_122
-; R600-NEXT: LSHL * T2.W, T1.Y, literal.y,
+; R600-NEXT: CNDE_INT T8.X, PS, PV.W, PV.Z,
+; R600-NEXT: SETGT_INT T5.Y, T0.Z, literal.x,
+; R600-NEXT: CNDE_INT T0.Z, T4.W, PV.Y, T0.Y, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.W, T4.Z, T5.W, PV.X, BS:VEC_120/SCL_212
+; R600-NEXT: LSHL * T4.W, T4.Y, literal.y,
; R600-NEXT: 127(1.779649e-43), 23(3.222986e-44)
-; R600-NEXT: ADD_INT T8.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T1.Y, T2.Z, PV.W, PV.Z,
-; R600-NEXT: CNDE_INT T0.Z, PV.Y, T7.X, PV.X,
-; R600-NEXT: CNDE_INT * T0.W, T6.X, T5.X, T0.W, BS:VEC_021/SCL_122
-; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE * T1.W, T4.X, literal.x,
-; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T4.X, T6.W, T4.X, PV.W,
-; R600-NEXT: CNDE_INT * T2.Y, T4.W, T0.W, T1.X, BS:VEC_120/SCL_212
-; R600-NEXT: ALU clause starting at 204:
+; R600-NEXT: ADD_INT T7.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, PV.W,
+; R600-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X,
+; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T5.X, T2.Y, T1.W,
+; R600-NEXT: 1065353216(1.000000e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T5.X, T3.W, PS, T1.Y,
+; R600-NEXT: CNDE_INT * T1.Y, T2.W, T4.X, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT: ALU clause starting at 201:
; R600-NEXT: LSHL T0.Z, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE T0.W, T1.Y, T8.X,
+; R600-NEXT: MUL_IEEE T0.W, T0.Y, T7.X,
; R600-NEXT: SETGT * T1.W, literal.y, KC0[3].W,
; R600-NEXT: 23(3.222986e-44), -1026650416(-1.032789e+02)
-; R600-NEXT: CNDE T1.X, PS, PV.W, 0.0,
-; R600-NEXT: SETGT T1.Y, KC0[3].W, literal.x,
+; R600-NEXT: CNDE T4.X, PS, PV.W, 0.0,
+; R600-NEXT: SETGT T0.Y, KC0[3].W, literal.x,
; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT T0.W, T3.Y, T2.Y, T4.X, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE * T1.W, T2.X, T3.X, literal.z,
+; R600-NEXT: CNDE_INT T0.W, T5.Y, T5.X, T1.Y, BS:VEC_102/SCL_221
+; R600-NEXT: CNDE * T1.W, T0.X, T3.X, literal.z,
; R600-NEXT: 1118925336(8.872284e+01), 1065353216(1.000000e+00)
; R600-NEXT: 2139095040(INF), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T2.X, PV.W, PV.Z,
+; R600-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
; R600-NEXT: SETGT T2.Y, literal.x, KC0[3].Y,
; R600-NEXT: CNDE T1.Z, PV.Y, PV.X, literal.y,
-; R600-NEXT: CNDE T0.W, T0.X, T0.Y, 0.0,
+; R600-NEXT: CNDE T0.W, T2.X, T1.X, 0.0,
; R600-NEXT: SETGT * T2.W, KC0[3].Z, literal.z,
; R600-NEXT: -1026650416(-1.032789e+02), 2139095040(INF)
; R600-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00)
@@ -2285,8 +2273,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; CM-LABEL: s_exp_v4f32:
; CM: ; %bb.0:
; CM-NEXT: ALU 97, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 100, @104, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 36, @205, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 97, @104, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 35, @202, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -2305,224 +2293,220 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
-; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE T0.Z, T2.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x,
; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
; CM-NEXT: TRUNC T1.X, T1.Z,
-; CM-NEXT: RNDNE T2.Y, PV.W,
-; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
-; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: ADD * T0.W, PV.Y, PV.X,
; CM-NEXT: 1069064192(1.442383e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: TRUNC T2.X, T1.Z,
+; CM-NEXT: MULADD_IEEE T0.Y, T2.W, literal.x, T1.Y,
+; CM-NEXT: FLT_TO_INT T2.Z, T1.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
+; CM-NEXT: ADD T1.X, T0.Z, -T1.Z,
+; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T0.Z, T2.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T2.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
+; CM-NEXT: 102(1.429324e-43), -229(nan)
+; CM-NEXT: ADD_INT T4.X, T2.Z, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, T2.Z, literal.y,
+; CM-NEXT: CNDE_INT T0.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T2.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T5.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T2.Z,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T3.X,
+; CM-NEXT: SETGT_INT * T3.W, T2.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
+; CM-NEXT: AND_INT T3.X, KC0[3].Z, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Y, PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.Y, T0.W,
+; CM-NEXT: -4096(nan), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T1.Y, T3.Y, T5.X, PV.Z,
+; CM-NEXT: LSHL T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00)
+; CM-NEXT: RNDNE T4.X, PV.W,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, T3.W, PV.X, PV.Y,
+; CM-NEXT: ADD * T1.W, T1.X, T0.Y,
+; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; CM-NEXT: EXP_IEEE T0.X, T1.W,
; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z,
-; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212
-; CM-NEXT: FLT_TO_INT T0.Z, T1.X,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
-; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE T1.X, T0.Z, T2.Y,
+; CM-NEXT: TRUNC T0.Y, T4.X,
+; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE * T1.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x,
; CM-NEXT: MUL_IEEE T1.Y, T0.X, literal.y,
; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.z,
-; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.w,
+; CM-NEXT: MIN_INT * T2.W, PV.Z, literal.w,
; CM-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; CM-NEXT: -330(nan), 381(5.338947e-43)
-; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: ADD_INT T4.X, T0.Z, literal.x,
-; CM-NEXT: SETGT_UINT T4.Y, T0.Z, literal.y,
+; CM-NEXT: ADD_INT T6.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, T0.Z, literal.y,
; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.x,
; CM-NEXT: -127(nan), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z,
-; CM-NEXT: CNDE_INT T3.Y, PV.Y, PV.X, T3.X,
-; CM-NEXT: SETGT_INT T0.Z, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE * T3.W, T1.Y, literal.y,
-; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
-; CM-NEXT: CNDE_INT T3.X, T4.Y, T1.Y, PV.W,
-; CM-NEXT: AND_INT T1.Y, KC0[3].Z, literal.x,
-; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.X, T0.W,
-; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
-; CM-NEXT: LSHL T3.Y, PV.Z, literal.x,
-; CM-NEXT: TRUNC T1.Z, T2.Y,
-; CM-NEXT: ADD * T0.W, KC0[3].Z, -PV.Y,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
-; CM-NEXT: FLT_TO_INT T2.Y, PV.Z,
-; CM-NEXT: ADD_INT T1.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T1.W, T0.Z, PV.X, T3.X,
-; CM-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT: MIN_INT T3.Y, PV.Y, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.X,
-; CM-NEXT: ADD * T0.W, T0.Y, T2.X,
-; CM-NEXT: 381(5.338947e-43), 1069064192(1.442383e+00)
-; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE T0.Y, T0.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT: MULADD_IEEE T1.X, T1.Y, literal.x, T0.Z,
-; CM-NEXT: MUL_IEEE T4.Y, PV.Y, literal.y,
-; CM-NEXT: ADD_INT T0.Z, T3.Y, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: MAX_INT * T0.W, T2.Y, literal.w, BS:VEC_201
-; CM-NEXT: 967029397(3.122284e-04), 2130706432(1.701412e+38)
-; CM-NEXT: -254(nan), -330(nan)
-; CM-NEXT: ADD_INT T2.X, T2.Y, literal.x,
-; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T2.Y, literal.z,
-; CM-NEXT: SETGT_UINT * T0.W, T2.Y, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
-; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T3.X, T2.Y, literal.x,
-; CM-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T2.Y, literal.y,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T4.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT * T3.Y, PV.Z, PV.Y, T2.Y,
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 104:
-; CM-NEXT: CNDE_INT T0.Z, T3.X, T2.X, T0.Z,
-; CM-NEXT: SETGT_INT * T2.W, T2.Y, literal.x,
+; CM-NEXT: CNDE_INT T7.X, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T2.Y, PV.Y, PV.X, T5.X,
+; CM-NEXT: SETGT_INT * T0.Z, T0.Z, literal.x,
; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T2.X, T1.Y, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, T3.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T0.W, T4.X, T1.W,
-; CM-NEXT: MUL_IEEE * T0.W, T4.Y, literal.y, BS:VEC_201
-; CM-NEXT: 1069064192(1.442383e+00), 2130706432(1.701412e+38)
-; CM-NEXT: AND_INT T4.X, KC0[4].X, literal.x,
-; CM-NEXT: CNDE_INT T2.Y, T3.X, T4.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.Y,
-; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
-; CM-NEXT: -4096(nan), 23(3.222986e-44)
-; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.Z, PV.Y,
-; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y,
-; CM-NEXT: RNDNE * T0.W, T2.X,
-; CM-NEXT: 1065353216(1.000000e+00), 1069064192(1.442383e+00)
-; CM-NEXT: ADD T2.X, T2.X, -PV.W,
-; CM-NEXT: RNDNE T1.Y, PV.Z,
-; CM-NEXT: MUL_IEEE T1.Z, PV.Y, PV.X,
-; CM-NEXT: SETGT * T1.W, literal.x, KC0[3].W,
-; CM-NEXT: -1026650416(-1.032789e+02), 0(0.000000e+00)
-; CM-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0,
-; CM-NEXT: TRUNC T0.Y, T0.W,
-; CM-NEXT: TRUNC T1.Z, PV.Y,
-; CM-NEXT: ADD * T0.W, PV.X, T1.X,
+; CM-NEXT: ALU clause starting at 104:
+; CM-NEXT: ADD * T4.W, KC0[3].Z, -T3.X,
+; CM-NEXT: MUL_IEEE T5.X, PV.W, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, T0.Z, T7.X, T2.Y,
+; CM-NEXT: MUL_IEEE T1.Z, T1.Y, literal.y,
+; CM-NEXT: CNDE_INT * T1.W, T2.W, T2.X, T1.W, BS:VEC_021/SCL_122
+; CM-NEXT: 967029397(3.122284e-04), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T3.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T1.Y, T3.Y, T1.Y, PV.Z,
+; CM-NEXT: LSHL T1.Z, PV.Y, literal.x,
+; CM-NEXT: MULADD_IEEE * T1.W, T4.W, literal.y, PV.X, BS:VEC_120/SCL_212
+; CM-NEXT: 23(3.222986e-44), 1069064192(1.442383e+00)
+; CM-NEXT: MULADD_IEEE T2.X, T3.X, literal.x, PV.W,
+; CM-NEXT: ADD T2.Y, T0.W, -T4.X,
+; CM-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T0.Z, PV.X, PV.Y,
+; CM-NEXT: 967029397(3.122284e-04), 1065353216(1.000000e+00)
+; CM-NEXT: AND_INT T0.X, KC0[4].X, literal.x,
+; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z,
+; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].W,
+; CM-NEXT: ADD * T0.W, PV.Y, PV.X,
+; CM-NEXT: -4096(nan), -1026650416(-1.032789e+02)
; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W,
; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
; CM-NEXT: EXP_IEEE * T0.W, T0.W,
-; CM-NEXT: FLT_TO_INT T1.X, T1.Z,
-; CM-NEXT: FLT_TO_INT T0.Y, T0.Y,
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, KC0[4].X, -T4.X,
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE T2.Z, PV.Z, literal.z,
-; CM-NEXT: SETGT_UINT * T2.W, PV.Y, literal.w,
-; CM-NEXT: 967029397(3.122284e-04), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T5.X, PV.W, T1.Z, PV.Z,
-; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: MULADD_IEEE T1.Z, T1.W, literal.y, PV.X,
-; CM-NEXT: MAX_INT * T1.W, T1.X, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 1069064192(1.442383e+00)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T2.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; CM-NEXT: MULADD_IEEE T1.Z, T4.X, literal.z, PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT: MAX_INT * T1.W, T0.Y, literal.w,
-; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: CNDE T2.X, T0.Z, T1.Y, 0.0,
+; CM-NEXT: ADD T1.Y, KC0[4].X, -T0.X,
+; CM-NEXT: FLT_TO_INT T0.Z, T0.Y,
+; CM-NEXT: MUL_IEEE * T1.W, PV.W, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T0.Y, PV.Z, literal.y,
+; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.z,
+; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.w,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
+; CM-NEXT: 967029397(3.122284e-04), 1069064192(1.442383e+00)
+; CM-NEXT: RNDNE T4.X, PV.W,
+; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.W,
+; CM-NEXT: SETGT_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: 1069064192(1.442383e+00), -127(nan)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.W,
+; CM-NEXT: MULADD_IEEE T1.Y, T0.X, literal.x, PV.Y,
+; CM-NEXT: ADD T1.Z, T2.W, -PV.X,
+; CM-NEXT: MAX_INT * T2.W, T0.Z, literal.y,
; CM-NEXT: 967029397(3.122284e-04), -330(nan)
-; CM-NEXT: ADD T4.X, T0.Z, -T1.Y,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: SETGT_UINT * T1.W, T0.Y, literal.z,
+; CM-NEXT: ADD_INT T0.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, T0.Z, literal.y,
+; CM-NEXT: TRUNC T2.Z, T4.X,
+; CM-NEXT: ADD * T2.W, PV.Z, PV.Y,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T6.X, T1.X, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: ADD * T3.W, PV.X, T1.Z,
-; CM-NEXT: -229(nan), -127(nan)
-; CM-NEXT: EXP_IEEE T1.X (MASKED), T3.W,
-; CM-NEXT: EXP_IEEE T1.Y (MASKED), T3.W,
-; CM-NEXT: EXP_IEEE T1.Z, T3.W,
-; CM-NEXT: EXP_IEEE * T1.W (MASKED), T3.W,
-; CM-NEXT: CNDE_INT T4.X, T0.Z, T1.Y, T0.Y,
-; CM-NEXT: CNDE_INT T1.Y, T6.X, T2.X, T4.Y, BS:VEC_120/SCL_212
-; CM-NEXT: SETGT_INT T2.Z, T1.X, literal.x,
-; CM-NEXT: MUL_IEEE * T3.W, PV.Z, literal.y,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T2.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T4.Y, PV.W, literal.y,
-; CM-NEXT: CNDE_INT T3.Z, PV.Z, PV.Y, T1.X,
-; CM-NEXT: MIN_INT * T4.W, T1.X, literal.z,
+; CM-NEXT: EXP_IEEE T1.X (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE T1.Y, T2.W,
+; CM-NEXT: EXP_IEEE T1.Z (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE * T1.W (MASKED), T2.W,
+; CM-NEXT: MUL_IEEE T4.X, T0.W, literal.x,
+; CM-NEXT: FLT_TO_INT T3.Y, T2.Z,
+; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T0.Y, T0.X, T2.Y,
; CM-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; CM-NEXT: CNDE_INT T0.X, T1.W, PV.W, T0.Z,
+; CM-NEXT: MUL_IEEE T0.Y, PV.Z, literal.x,
+; CM-NEXT: MAX_INT T2.Z, PV.Y, literal.y,
+; CM-NEXT: MIN_INT * T0.W, PV.Y, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T7.X, T0.Y, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T4.Z, T1.X, literal.z,
-; CM-NEXT: SETGT_UINT * T4.W, T1.X, literal.w,
-; CM-NEXT: 381(5.338947e-43), -254(nan)
+; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T3.Y, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T3.Y, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
+; CM-NEXT: 102(1.429324e-43), -229(nan)
+; CM-NEXT: ADD_INT T6.X, T3.Y, literal.x,
+; CM-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T1.W, T3.Y, literal.x,
; CM-NEXT: -127(nan), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T8.X, PV.W, PV.Z, PV.Y,
-; CM-NEXT: SETGT_INT T1.Y, T1.X, literal.x,
-; CM-NEXT: ADD_INT T4.Z, PV.X, literal.y,
-; CM-NEXT: ADD_INT * T5.W, T0.Y, literal.z,
+; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T3.Y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Y, PV.X, T5.X,
+; CM-NEXT: MIN_INT * T2.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T5.X, T3.Y, literal.x,
+; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T3.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T1.X, T2.W, PV.W, PV.Z,
-; CM-NEXT: CNDE_INT T5.Y, PV.Y, T3.Z, PV.X,
-; CM-NEXT: CNDE_INT T3.Z, T6.X, T4.Y, T3.W,
-; CM-NEXT: MUL_IEEE * T2.W, T2.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: CNDE_INT T2.Y, PV.X, T2.Y, T2.Z,
+; CM-NEXT: MUL_IEEE T2.Z, T7.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T0.W, T0.Y, T1.Z, BS:VEC_021/SCL_122
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T6.X, T0.Y, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T4.W, T2.X, PV.W,
-; CM-NEXT: CNDE_INT * T1.Z, T2.Z, PV.Z, T1.Z,
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 205:
-; CM-NEXT: LSHL * T2.W, T5.Y, literal.x,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T2.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T1.Y, T1.Z, T0.Y,
-; CM-NEXT: CNDE_INT * T1.Z, T6.X, T4.X, T1.X,
+; CM-NEXT: SETGT_INT T8.X, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.Y,
+; CM-NEXT: CNDE_INT T0.Z, T4.Y, T7.X, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: 127(1.779649e-43), 23(3.222986e-44)
+; CM-NEXT: ALU clause starting at 202:
+; CM-NEXT: ADD_INT T7.X, T0.W, literal.x,
+; CM-NEXT: CNDE_INT * T0.Y, T5.X, T0.Y, T0.Z,
; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT * T1.W, T1.W, T3.Y, T2.Y,
-; CM-NEXT: CNDE_INT T1.X, T0.Z, PV.W, T0.W,
-; CM-NEXT: LSHL T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T2.X,
+; CM-NEXT: CNDE_INT * T0.Z, T8.X, T0.X, T6.X,
+; CM-NEXT: MUL_IEEE * T0.W, T4.X, literal.x,
+; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; CM-NEXT: CNDE_INT T0.X, T2.W, T4.X, PV.W,
+; CM-NEXT: LSHL T1.Y, T0.Z, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T7.X, BS:VEC_021/SCL_122
; CM-NEXT: SETGT * T0.W, literal.y, KC0[4].X,
; CM-NEXT: 23(3.222986e-44), -1026650416(-1.032789e+02)
-; CM-NEXT: CNDE T2.X, PV.W, PV.Z, 0.0,
+; CM-NEXT: CNDE T4.X, PV.W, PV.Z, 0.0,
; CM-NEXT: SETGT T0.Y, KC0[4].X, literal.x,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T6.X, PV.X, T5.X,
+; CM-NEXT: CNDE_INT * T0.W, T8.X, T3.X, PV.X,
; CM-NEXT: 1118925336(8.872284e+01), 1065353216(1.000000e+00)
-; CM-NEXT: SETGT T1.X, KC0[3].W, literal.x,
+; CM-NEXT: SETGT T0.X, KC0[3].W, literal.x,
; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z,
; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z,
; CM-NEXT: CNDE * T0.W, PV.Y, PV.X, literal.z,
; CM-NEXT: 1118925336(8.872284e+01), -1026650416(-1.032789e+02)
; CM-NEXT: 2139095040(INF), 0(0.000000e+00)
-; CM-NEXT: SETGT T2.X, literal.x, KC0[3].Y,
+; CM-NEXT: SETGT T3.X, literal.x, KC0[3].Y,
; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, 0.0,
-; CM-NEXT: CNDE T0.Z, PV.X, T3.X, literal.y,
+; CM-NEXT: CNDE T0.Z, PV.X, T2.X, literal.y,
; CM-NEXT: SETGT * T1.W, KC0[3].Z, literal.z,
; CM-NEXT: -1026650416(-1.032789e+02), 2139095040(INF)
; CM-NEXT: 1118925336(8.872284e+01), 0(0.000000e+00)
; CM-NEXT: CNDE T0.Y, PV.W, PV.Y, literal.x,
-; CM-NEXT: CNDE T1.Z, PV.X, T0.X, 0.0,
+; CM-NEXT: CNDE T1.Z, PV.X, T1.X, 0.0,
; CM-NEXT: SETGT * T1.W, KC0[3].Y, literal.y,
; CM-NEXT: 2139095040(INF), 1118925336(8.872284e+01)
; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 544c1de6c7bb..a16294958748 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -230,23 +230,23 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z,
; R600-NEXT: -127(nan), 254(3.559298e-43)
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y,
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T0.Y, T1.X, literal.y,
; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X,
; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.x,
+; R600-NEXT: MUL_IEEE T3.W, PV.Y, literal.x,
+; R600-NEXT: CNDE_INT * T0.W, T0.W, PV.X, T2.W,
; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T1.Z, T1.Y, T3.X, PS,
-; R600-NEXT: CNDE_INT T0.W, T1.W, PV.W, T1.X,
+; R600-NEXT: CNDE_INT T1.Z, T1.W, PS, T1.X,
+; R600-NEXT: CNDE_INT T0.W, T1.Y, T0.Y, PV.W,
; R600-NEXT: LSHL * T1.W, PV.Z, literal.x,
; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; R600-NEXT: ADD_INT T1.W, PS, literal.x,
-; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT * T0.W, T4.W, PV.Z, PV.W,
; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; R600-NEXT: MUL_IEEE T0.W, PS, PV.W,
; R600-NEXT: SETGT * T1.W, literal.x, KC0[2].Z,
@@ -260,65 +260,63 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
;
; CM-LABEL: s_exp10_f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 64, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 62, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[2].Z, -PV.W,
-; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T2.Z, PV.W,
+; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: RNDNE T1.Z, PV.W,
; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W,
-; CM-NEXT: ADD T0.Z, T0.Z, -T2.W,
-; CM-NEXT: FLT_TO_INT * T0.W, PV.Z,
+; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
+; CM-NEXT: TRUNC T1.Z, T1.Z,
+; CM-NEXT: ADD * T0.W, PV.W, PV.Z,
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: FLT_TO_INT T0.Z, T1.Z,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y,
-; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z,
-; CM-NEXT: 2130706432(1.701412e+38), -254(nan)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T1.X, T0.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
+; CM-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z,
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W,
-; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z,
-; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43)
+; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.Y, T2.X, T0.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.X,
-; CM-NEXT: LSHL * T1.W, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Z, T2.Y, T3.X, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.x,
; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
; CM-NEXT: ADD_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Z, PV.Y,
+; CM-NEXT: CNDE_INT * T0.W, T3.W, PV.Y, PV.Z,
; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; CM-NEXT: MUL_IEEE T0.Z, PV.W, PV.Z,
; CM-NEXT: SETGT * T0.W, literal.x, KC0[2].Z,
@@ -612,105 +610,105 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
; R600-NEXT: AND_INT * T0.W, KC0[3].X, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
; R600-NEXT: ADD * T1.W, KC0[3].X, -PV.W,
-; R600-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.y,
-; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.z,
-; R600-NEXT: -4096(nan), 975668412(6.390323e-04)
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T1.Z, PS,
+; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; R600-NEXT: RNDNE T0.Z, PS,
; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PV.W,
-; R600-NEXT: ADD * T2.W, KC0[2].W, -PV.Z,
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x,
-; R600-NEXT: MUL_IEEE T2.Z, T0.Z, literal.y,
+; R600-NEXT: AND_INT * T2.W, KC0[2].W, literal.y,
+; R600-NEXT: 1079283712(3.321289e+00), -4096(nan)
+; R600-NEXT: ADD T1.Z, KC0[2].W, -PS,
; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
; R600-NEXT: ADD * T1.W, T3.W, -PV.Z,
+; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
+; R600-NEXT: ADD T2.Z, PS, PV.W,
+; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x,
+; R600-NEXT: MUL_IEEE * T1.W, T2.W, literal.y,
; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
-; R600-NEXT: ADD T3.Z, PS, PV.W,
-; R600-NEXT: RNDNE T0.W, PV.Z,
-; R600-NEXT: MULADD_IEEE * T1.W, T2.W, literal.x, PV.Y, BS:VEC_021/SCL_122
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: TRUNC T0.Y, T1.Z,
-; R600-NEXT: MULADD_IEEE T0.Z, T0.Z, literal.x, PS, BS:VEC_120/SCL_212
-; R600-NEXT: ADD T1.W, T2.Z, -PV.W, BS:VEC_201
+; R600-NEXT: RNDNE T0.Y, PS,
+; R600-NEXT: MULADD_IEEE T1.Z, T1.Z, literal.x, PV.W,
+; R600-NEXT: TRUNC T0.W, T0.Z, BS:VEC_120/SCL_212
; R600-NEXT: EXP_IEEE * T0.X, PV.Z,
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T0.Z, PV.W, PV.Z,
-; R600-NEXT: FLT_TO_INT T1.W, PV.Y,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.x,
-; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T1.Z, PS, literal.x,
-; R600-NEXT: SETGT_UINT T3.W, PV.W, literal.y,
-; R600-NEXT: EXP_IEEE * T0.Y, PV.Z,
-; R600-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T1.X, PV.W, T2.W, PV.Z,
-; R600-NEXT: MUL_IEEE T1.Y, PS, literal.x,
-; R600-NEXT: MAX_INT T0.Z, T1.W, literal.y,
-; R600-NEXT: MIN_INT T2.W, T1.W, literal.z,
-; R600-NEXT: TRUNC * T0.W, T0.W,
+; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
+; R600-NEXT: FLT_TO_INT T1.Y, PV.W,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.y, PV.Z,
+; R600-NEXT: ADD * T1.W, T1.W, -PV.Y,
+; R600-NEXT: 209715200(1.972152e-31), 975668412(6.390323e-04)
+; R600-NEXT: ADD T1.Z, PS, PV.W,
+; R600-NEXT: MUL_IEEE T0.W, PV.Z, literal.x,
+; R600-NEXT: SETGT_UINT * T1.W, PV.Y, literal.y,
+; R600-NEXT: 209715200(1.972152e-31), -229(nan)
+; R600-NEXT: CNDE_INT T0.Z, PS, PV.W, T0.Z,
+; R600-NEXT: SETGT_INT T0.W, T1.Y, literal.x,
+; R600-NEXT: EXP_IEEE * T1.X, PV.Z,
+; R600-NEXT: -127(nan), 0(0.000000e+00)
+; R600-NEXT: CNDE_INT T0.Z, PV.W, PV.Z, T0.X,
+; R600-NEXT: MAX_INT T2.W, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, PS, literal.y,
+; R600-NEXT: -330(nan), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y,
+; R600-NEXT: ADD_INT T1.Z, T1.Y, literal.z,
+; R600-NEXT: MIN_INT T2.W, T1.Y, literal.w,
+; R600-NEXT: TRUNC * T4.W, T0.Y,
+; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43)
+; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43)
+; R600-NEXT: FLT_TO_INT T3.X, PS,
+; R600-NEXT: ADD_INT T0.Y, PV.W, literal.x,
+; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T2.W, T1.Y, literal.z,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, PV.Y, PV.Z,
+; R600-NEXT: -254(nan), -127(nan)
+; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T4.X, T1.X, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T0.X, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T1.Z, T0.W, PS, T1.Y,
+; R600-NEXT: CNDE_INT T0.W, PV.W, PV.Z, PV.Y,
+; R600-NEXT: MAX_INT * T1.W, PV.X, literal.y,
; R600-NEXT: 2130706432(1.701412e+38), -330(nan)
-; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T2.X, PS,
-; R600-NEXT: ADD_INT T2.Y, PV.W, literal.x,
-; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T0.W, T1.W, literal.z,
-; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.w,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T3.X, T1.W, literal.x,
-; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W,
-; R600-NEXT: SETGT_INT T0.Z, T1.W, literal.x,
-; R600-NEXT: MUL_IEEE T0.W, T0.X, literal.y,
-; R600-NEXT: MUL_IEEE * T4.W, T0.Y, literal.y,
-; R600-NEXT: -127(nan), 209715200(1.972152e-31)
-; R600-NEXT: MUL_IEEE T4.X, PS, literal.x,
-; R600-NEXT: MUL_IEEE T4.Y, PV.W, literal.x,
-; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, T1.W,
-; R600-NEXT: CNDE_INT T3.W, T3.W, PV.X, T2.Y,
-; R600-NEXT: MAX_INT * T5.W, T2.X, literal.y,
-; R600-NEXT: 209715200(1.972152e-31), -330(nan)
-; R600-NEXT: SETGT_INT T3.X, T1.W, literal.x,
-; R600-NEXT: ADD_INT T2.Y, PS, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T2.X, literal.z,
-; R600-NEXT: SETGT_UINT * T1.W, T2.X, literal.w,
+; R600-NEXT: SETGT_INT T0.X, T1.Y, literal.x,
+; R600-NEXT: ADD_INT T0.Y, PS, literal.y,
+; R600-NEXT: ADD_INT T2.Z, T3.X, literal.z,
+; R600-NEXT: SETGT_UINT * T1.W, T3.X, literal.w,
; R600-NEXT: 127(1.779649e-43), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: MIN_INT * T5.W, T2.X, literal.x,
+; R600-NEXT: MIN_INT * T4.W, T3.X, literal.x,
; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
; R600-NEXT: ADD_INT T5.X, PV.W, literal.x,
-; R600-NEXT: ADD_INT T3.Y, T2.X, literal.y,
-; R600-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
-; R600-NEXT: CNDE_INT T5.W, T1.W, T2.Y, T2.Z,
-; R600-NEXT: SETGT_INT * T6.W, T2.X, literal.y,
+; R600-NEXT: ADD_INT T1.Y, T3.X, literal.y,
+; R600-NEXT: SETGT_UINT T3.Z, T3.X, literal.z,
+; R600-NEXT: CNDE_INT T4.W, T1.W, T0.Y, T2.Z,
+; R600-NEXT: SETGT_INT * T5.W, T3.X, literal.y,
; R600-NEXT: -254(nan), -127(nan)
; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T2.X,
-; R600-NEXT: CNDE_INT T2.Y, PV.Z, PV.Y, PV.X,
-; R600-NEXT: SETGT_INT T2.Z, T2.X, literal.x, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T3.W, T3.X, T1.Z, T3.W, BS:VEC_021/SCL_122
-; R600-NEXT: CNDE_INT * T0.W, T2.W, T4.Y, T0.W,
-; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, T0.Z, PS, T0.X,
-; R600-NEXT: LSHL T3.Y, PV.W, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, PV.Z, PV.X, PV.Y,
-; R600-NEXT: CNDE_INT T0.W, T1.W, T4.X, T4.W,
-; R600-NEXT: MUL_IEEE * T1.W, T1.Y, literal.y,
+; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T3.X,
+; R600-NEXT: CNDE_INT T0.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT: SETGT_INT T2.Z, T3.X, literal.x,
+; R600-NEXT: CNDE_INT T0.W, T0.X, T1.Z, T0.W, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE * T4.W, T2.Y, literal.y,
+; R600-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T3.X, T2.W, T2.Y, PS, BS:VEC_120/SCL_212
+; R600-NEXT: LSHL T1.Y, PV.W, literal.x,
+; R600-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y,
+; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W,
; R600-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T2.X, T3.Z, T1.Y, PS,
-; R600-NEXT: CNDE_INT T0.Y, T6.W, PV.W, T0.Y,
-; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT: CNDE_INT T1.X, T5.W, PS, T1.X, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.Y, T3.Z, T4.X, PV.W, BS:VEC_201
+; R600-NEXT: LSHL T1.Z, PV.Z, literal.x,
; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: CNDE_INT * T1.W, T3.X, PV.X, T1.X,
+; R600-NEXT: CNDE_INT * T1.W, T0.X, T0.Z, PV.X,
; R600-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
; R600-NEXT: MUL_IEEE T1.Y, PS, PV.W,
-; R600-NEXT: SETGT T1.Z, literal.x, KC0[3].X,
+; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].X,
; R600-NEXT: ADD_INT * T0.W, PV.Z, literal.y,
; R600-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00)
; R600-NEXT: ALU clause starting at 101:
-; R600-NEXT: CNDE_INT * T1.W, T2.Z, T0.Y, T2.X,
+; R600-NEXT: CNDE_INT * T1.W, T2.Z, T1.X, T0.Y,
; R600-NEXT: MUL_IEEE T0.Y, PV.W, T0.W,
-; R600-NEXT: SETGT T0.Z, literal.x, KC0[2].W,
-; R600-NEXT: CNDE T0.W, T1.Z, T1.Y, 0.0,
+; R600-NEXT: SETGT T1.Z, literal.x, KC0[2].W,
+; R600-NEXT: CNDE T0.W, T0.Z, T1.Y, 0.0,
; R600-NEXT: SETGT * T1.W, KC0[3].X, literal.y,
; R600-NEXT: -1036817932(-4.485347e+01), 1109008539(3.853184e+01)
; R600-NEXT: CNDE T1.Y, PS, PV.W, literal.x,
@@ -723,118 +721,116 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
;
; CM-LABEL: s_exp10_v2f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 100, @4, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 18, @105, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 98, @4, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 18, @103, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: ALU clause starting at 4:
; CM-NEXT: AND_INT * T0.W, KC0[2].W, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[2].W, -PV.W,
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y,
+; CM-NEXT: AND_INT * T2.W, KC0[3].X, literal.z,
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: -4096(nan), 0(0.000000e+00)
+; CM-NEXT: ADD T1.Y, KC0[3].X, -PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Y,
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T0.Y, PV.W,
-; CM-NEXT: AND_INT T2.Z, KC0[3].X, literal.x,
-; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.y, PV.Z,
-; CM-NEXT: -4096(nan), 1079283712(3.321289e+00)
; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
-; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y,
-; CM-NEXT: FLT_TO_INT T1.Z, PV.Y,
-; CM-NEXT: ADD * T0.W, KC0[3].X, -PV.Z,
+; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
-; CM-NEXT: ADD T1.X, T0.Z, -T2.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
-; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y,
-; CM-NEXT: RNDNE * T1.W, PV.Y,
-; CM-NEXT: 975668412(6.390323e-04), -330(nan)
-; CM-NEXT: TRUNC T2.X, PV.W,
-; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.Y,
-; CM-NEXT: ADD * T0.W, PV.X, T0.X,
-; CM-NEXT: 204(2.858649e-43), 1079283712(3.321289e+00)
-; CM-NEXT: EXP_IEEE T0.X, T0.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT: ADD_INT T1.X, T1.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Y, T2.Z, literal.y, T0.Z, BS:VEC_102/SCL_221
-; CM-NEXT: ADD T0.Z, T1.Y, -T1.W,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z,
-; CM-NEXT: 102(1.429324e-43), 975668412(6.390323e-04)
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T3.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y,
-; CM-NEXT: SETGT_UINT T2.Z, T1.Z, literal.z,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
-; CM-NEXT: -229(nan), 2130706432(1.701412e+38)
-; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: TRUNC T1.X, T1.Z,
+; CM-NEXT: RNDNE T2.Y, PV.W,
+; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X, T1.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
+; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z,
+; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T0.Z, T1.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
+; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T1.Y, PV.Z, literal.y,
+; CM-NEXT: TRUNC T1.Z, T2.Y,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
; CM-NEXT: EXP_IEEE T0.X (MASKED), T1.W,
; CM-NEXT: EXP_IEEE T0.Y, T1.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: CNDE_INT T4.X, T2.Z, T0.W, T1.Y,
-; CM-NEXT: CNDE_INT T1.Y, T3.X, T2.Y, T1.X,
-; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x,
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T1.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y,
-; CM-NEXT: MUL_IEEE T3.Z, PV.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T2.X, PV.W, T0.W, PV.Z,
+; CM-NEXT: FLT_TO_INT T2.X, T1.Z,
+; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Z, T1.Y, T1.X, T0.W,
+; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 209715200(1.972152e-31), -127(nan)
+; CM-NEXT: CNDE_INT T1.X, PV.W, PV.Z, T0.X,
; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: CNDE_INT T3.Z, PV.X, T1.Y, T1.Z,
-; CM-NEXT: MAX_INT * T0.W, T0.Z, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), -330(nan)
-; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, T0.Z, literal.y,
-; CM-NEXT: SETGT_UINT T4.Z, T0.Z, literal.z,
-; CM-NEXT: MUL_IEEE * T0.W, T0.Y, literal.w,
+; CM-NEXT: SETGT_UINT T1.Z, PV.X, literal.y,
+; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
+; CM-NEXT: -330(nan), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T4.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Z, PV.Y, T2.Y,
+; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T6.X, PV.W, literal.x,
-; CM-NEXT: MIN_INT T4.Y, T0.Z, literal.y,
-; CM-NEXT: CNDE_INT T5.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z,
-; CM-NEXT: MIN_INT T1.Y, T1.Z, literal.x,
-; CM-NEXT: ADD_INT T5.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T3.W, T0.Z, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 381(5.338947e-43), -254(nan)
; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T7.X, T1.W, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T4.Y, T0.Z, literal.x,
-; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T1.W, T1.Z, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT T4.X, PV.W, PV.Z, T0.Y,
+; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.x,
+; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, PV.Y,
+; CM-NEXT: 2130706432(1.701412e+38), -330(nan)
+; CM-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.Z,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.x,
+; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y,
+; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
+; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT: -254(nan), -127(nan)
+; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T5.X, T0.Y, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T2.X,
+; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
+; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T8.X, T2.Z, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T5.X, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T4.Z, T6.X, T0.W, BS:VEC_201
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T5.X, T2.W, PV.W, T0.Y,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.X, T0.Y, T1.Z,
+; CM-NEXT: MUL_IEEE * T1.W, T5.X, literal.y,
+; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T5.X, T3.Z, T5.X, PV.W,
; CM-NEXT: LSHL T0.Y, PV.Z, literal.x,
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T3.Z, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T3.X, T3.Y, T2.Y, BS:VEC_201
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T1.X, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, T0.X, PV.X, BS:VEC_021/SCL_122
+; CM-NEXT: MUL_IEEE * T1.W, T2.Y, literal.y,
+; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T0.W, T2.Y, PV.W,
; CM-NEXT: LSHL T2.Y, PV.Z, literal.x,
; CM-NEXT: ADD_INT * T0.Z, PV.Y, literal.y,
; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
-; CM-NEXT: ALU clause starting at 105:
-; CM-NEXT: CNDE_INT * T0.W, T4.Y, T5.X, T2.X,
-; CM-NEXT: MUL_IEEE T1.X, PV.W, T0.Z,
+; CM-NEXT: ALU clause starting at 103:
+; CM-NEXT: CNDE_INT * T0.W, T2.X, T4.X, T5.X,
+; CM-NEXT: MUL_IEEE T2.X, PV.W, T0.Z,
; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].X,
; CM-NEXT: ADD_INT T0.Z, T2.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T1.Y, T0.X, T4.X, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT * T0.W, T1.Y, T1.X, T0.X, BS:VEC_120/SCL_212
; CM-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00)
; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
; CM-NEXT: SETGT T1.Y, literal.x, KC0[2].W,
@@ -1217,8 +1213,8 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; R600-LABEL: s_exp10_v3f32:
; R600: ; %bb.0:
-; R600-NEXT: ALU 100, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 69, @107, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 99, @6, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 69, @106, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT: CF_END
@@ -1226,69 +1222,68 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: ALU clause starting at 6:
; R600-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: ADD T1.W, KC0[3].Y, -PV.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE T1.W, PV.W, literal.x,
+; R600-NEXT: ADD * T2.W, KC0[3].Y, -PV.W,
; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T3.W, PS,
-; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x,
+; R600-NEXT: RNDNE * T3.W, PV.W,
+; R600-NEXT: TRUNC T4.W, PV.W,
+; R600-NEXT: MUL_IEEE * T5.W, T2.W, literal.x,
; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS,
-; R600-NEXT: TRUNC * T4.W, PV.W,
+; R600-NEXT: MULADD_IEEE T2.W, T2.W, literal.x, PS,
+; R600-NEXT: FLT_TO_INT * T4.W, PV.W,
; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T0.Z, PS,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
-; R600-NEXT: ADD * T1.W, T2.W, -T3.W,
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T0.W, PS, PV.W,
-; R600-NEXT: MAX_INT * T1.W, PV.Z, literal.x,
-; R600-NEXT: -330(nan), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T0.Y, PS, literal.x,
-; R600-NEXT: ADD_INT T1.Z, T0.Z, literal.y,
-; R600-NEXT: SETGT_UINT T1.W, T0.Z, literal.z,
-; R600-NEXT: EXP_IEEE * T0.X, PV.W,
+; R600-NEXT: MAX_INT T0.Z, PS, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.W,
+; R600-NEXT: ADD * T1.W, T1.W, -T3.W,
+; R600-NEXT: -330(nan), 975668412(6.390323e-04)
+; R600-NEXT: ADD T0.Y, PS, PV.W,
+; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.x,
+; R600-NEXT: ADD_INT T0.W, T4.W, literal.y,
+; R600-NEXT: SETGT_UINT * T1.W, T4.W, literal.z,
; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
; R600-NEXT: -229(nan), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T0.W, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.y,
-; R600-NEXT: -127(nan), 209715200(1.972152e-31)
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.x,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
-; R600-NEXT: MIN_INT T3.W, T0.Z, literal.y,
-; R600-NEXT: AND_INT * T4.W, KC0[3].W, literal.z,
-; R600-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
-; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T1.X, T0.X, literal.x,
-; R600-NEXT: ADD T1.Y, KC0[3].W, -PS,
-; R600-NEXT: ADD_INT T2.Z, PV.W, literal.y,
-; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT * T5.W, T0.Z, literal.w,
-; R600-NEXT: 2130706432(1.701412e+38), -254(nan)
+; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
+; R600-NEXT: SETGT_INT T0.W, T4.W, literal.x,
+; R600-NEXT: EXP_IEEE * T0.X, PV.Y,
+; R600-NEXT: -127(nan), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T1.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T0.Y, PV.W, PV.Z, T4.W,
+; R600-NEXT: MIN_INT T0.Z, T4.W, literal.y,
+; R600-NEXT: AND_INT T2.W, KC0[3].W, literal.z,
+; R600-NEXT: MUL_IEEE * T3.W, PS, literal.w,
+; R600-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; R600-NEXT: -4096(nan), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: ADD T1.Y, KC0[3].W, -PV.W,
+; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
+; R600-NEXT: ADD_INT T5.W, T4.W, literal.z,
+; R600-NEXT: SETGT_UINT * T6.W, T4.W, literal.w,
+; R600-NEXT: 209715200(1.972152e-31), -254(nan)
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Z,
-; R600-NEXT: SETGT_INT T2.Y, T0.Z, literal.x,
+; R600-NEXT: CNDE_INT T3.X, PS, PV.W, PV.Z,
+; R600-NEXT: SETGT_INT T2.Y, T4.W, literal.x,
; R600-NEXT: MUL_IEEE T0.Z, PV.Y, literal.y,
-; R600-NEXT: MUL_IEEE T3.W, T4.W, literal.z,
-; R600-NEXT: MUL_IEEE * T6.W, PV.X, literal.w,
+; R600-NEXT: MUL_IEEE * T4.W, T2.W, literal.z, BS:VEC_120/SCL_212
; R600-NEXT: 127(1.779649e-43), 975668412(6.390323e-04)
-; R600-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T1.X, T5.W, T1.X, PS, BS:VEC_120/SCL_212
-; R600-NEXT: RNDNE T3.Y, PV.W,
-; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
-; R600-NEXT: CNDE_INT T5.W, PV.Y, T1.Z, PV.X,
-; R600-NEXT: CNDE_INT * T1.W, T1.W, T0.Y, T2.W,
; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T0.X, T0.W, PS, T0.X,
+; R600-NEXT: CNDE_INT * T1.W, T1.W, T2.X, T3.W,
+; R600-NEXT: CNDE_INT T0.X, T0.W, PV.W, T0.X, BS:VEC_021/SCL_122
+; R600-NEXT: RNDNE T3.Y, T4.W, BS:VEC_120/SCL_212
+; R600-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, T0.Z,
+; R600-NEXT: CNDE_INT T0.W, T2.Y, T0.Y, T3.X, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE * T1.W, T1.X, literal.y,
+; R600-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T1.X, T6.W, T1.X, PS,
; R600-NEXT: LSHL T0.Y, PV.W, literal.x,
; R600-NEXT: AND_INT T1.Z, KC0[3].Z, literal.y,
-; R600-NEXT: MULADD_IEEE T0.W, T4.W, literal.z, PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT: ADD * T1.W, T3.W, -PV.Y,
+; R600-NEXT: MULADD_IEEE T0.W, T2.W, literal.z, PV.Z, BS:VEC_120/SCL_212
+; R600-NEXT: ADD * T1.W, T4.W, -PV.Y,
; R600-NEXT: 23(3.222986e-44), -4096(nan)
; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
; R600-NEXT: ADD T1.Y, PS, PV.W,
; R600-NEXT: MUL_IEEE T0.Z, PV.Z, literal.x,
; R600-NEXT: ADD_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: CNDE_INT * T1.W, T2.Y, PV.X, T1.X,
+; R600-NEXT: CNDE_INT * T1.W, T2.Y, T0.X, PV.X,
; R600-NEXT: 1079283712(3.321289e+00), 1065353216(1.000000e+00)
; R600-NEXT: MUL_IEEE T0.X, PS, PV.W,
; R600-NEXT: ADD T0.Y, KC0[3].Z, -T1.Z,
@@ -1302,12 +1297,12 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: MUL_IEEE * T1.W, PS, literal.z,
; R600-NEXT: -1036817932(-4.485347e+01), 975668412(6.390323e-04)
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T1.X, literal.y,
; R600-NEXT: MULADD_IEEE T4.Z, T0.Y, literal.z, PV.W,
; R600-NEXT: FLT_TO_INT T0.W, PV.Z,
; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.w,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; R600-NEXT: 1079283712(3.321289e+00), 381(5.338947e-43)
; R600-NEXT: ADD_INT T4.X, PS, literal.x,
; R600-NEXT: MAX_INT T0.Y, PV.W, literal.y,
@@ -1325,7 +1320,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: 102(1.429324e-43), -229(nan)
; R600-NEXT: ADD_INT * T6.X, T0.W, literal.x,
; R600-NEXT: -127(nan), 0(0.000000e+00)
-; R600-NEXT: ALU clause starting at 107:
+; R600-NEXT: ALU clause starting at 106:
; R600-NEXT: SETGT_UINT T0.Y, T0.W, literal.x,
; R600-NEXT: CNDE_INT T0.Z, T3.W, T0.Z, T2.W, BS:VEC_102/SCL_221
; R600-NEXT: SETGT_INT T2.W, T0.W, literal.y,
@@ -1341,25 +1336,25 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: SETGT_UINT T5.X, T1.Y, literal.x,
; R600-NEXT: CNDE_INT T4.Y, PS, PV.Z, PV.W,
; R600-NEXT: MAX_INT T0.Z, T1.Y, literal.y,
-; R600-NEXT: MUL_IEEE T4.W, T1.Z, literal.z,
-; R600-NEXT: MUL_IEEE * T5.W, PV.Y, literal.w,
+; R600-NEXT: MUL_IEEE T4.W, PV.Y, literal.z,
+; R600-NEXT: MUL_IEEE * T5.W, T1.Z, literal.w,
; R600-NEXT: 254(3.559298e-43), -330(nan)
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
-; R600-NEXT: CNDE_INT T6.X, T3.W, PS, T3.Y, BS:VEC_021/SCL_122
-; R600-NEXT: MUL_IEEE T3.Y, PV.W, literal.x,
+; R600-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
+; R600-NEXT: MUL_IEEE T6.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, T3.W, PV.W, T3.Y, BS:VEC_021/SCL_122
; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
; R600-NEXT: ADD_INT T3.W, T1.Y, literal.z,
-; R600-NEXT: SETGT_UINT * T5.W, T1.Y, literal.w,
+; R600-NEXT: SETGT_UINT * T4.W, T1.Y, literal.w,
; R600-NEXT: 2130706432(1.701412e+38), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), -229(nan)
; R600-NEXT: CNDE_INT T8.X, PS, PV.Z, PV.W,
; R600-NEXT: SETGT_INT T5.Y, T1.Y, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, T0.Y, T4.W, PV.Y, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T2.W, T2.W, PV.X, T1.Z,
+; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, T1.Z,
+; R600-NEXT: CNDE_INT T2.W, T0.Y, T5.W, PV.X, BS:VEC_120/SCL_212
; R600-NEXT: LSHL * T3.W, T4.Y, literal.y,
; R600-NEXT: -127(nan), 23(3.222986e-44)
; R600-NEXT: ADD_INT T6.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT T0.Y, T0.W, PV.Z, PV.W,
; R600-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T1.Y,
; R600-NEXT: CNDE_INT T0.W, T5.X, T7.X, T4.X,
; R600-NEXT: SETGT_INT * T2.W, T1.Y, literal.y,
@@ -1367,18 +1362,18 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; R600-NEXT: CNDE_INT T4.X, PS, PV.Z, PV.W,
; R600-NEXT: MUL_IEEE T0.Y, PV.Y, PV.X,
; R600-NEXT: SETGT T0.Z, literal.x, KC0[3].Z,
-; R600-NEXT: CNDE_INT T0.W, T5.W, T2.Y, T1.W,
-; R600-NEXT: MUL_IEEE * T1.W, T3.X, literal.y,
+; R600-NEXT: MUL_IEEE T0.W, T2.Y, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T4.W, T3.X, T1.W,
; R600-NEXT: -1036817932(-4.485347e+01), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T3.X, T5.X, T3.X, PS,
-; R600-NEXT: CNDE_INT T1.Y, T5.Y, PV.W, T1.X,
+; R600-NEXT: CNDE_INT T1.X, T5.Y, PS, T1.X,
+; R600-NEXT: CNDE_INT T1.Y, T5.X, T2.Y, PV.W,
; R600-NEXT: CNDE T0.Z, PV.Z, PV.Y, 0.0,
; R600-NEXT: SETGT T0.W, KC0[3].Z, literal.x,
; R600-NEXT: LSHL * T1.W, PV.X, literal.y,
; R600-NEXT: 1109008539(3.853184e+01), 23(3.222986e-44)
-; R600-NEXT: ADD_INT T1.X, PS, literal.x,
+; R600-NEXT: ADD_INT T3.X, PS, literal.x,
; R600-NEXT: CNDE T0.Y, PV.W, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.Y, PV.X,
+; R600-NEXT: CNDE_INT T0.Z, T2.W, PV.X, PV.Y,
; R600-NEXT: CNDE T0.W, T2.X, T0.X, 0.0,
; R600-NEXT: SETGT * T1.W, KC0[3].Y, literal.z,
; R600-NEXT: 1065353216(1.000000e+00), 2139095040(INF)
@@ -1399,197 +1394,193 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; CM-LABEL: s_exp10_v3f32:
; CM: ; %bb.0:
-; CM-NEXT: ALU 102, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 80, @109, KC0[CB0:0-32], KC1[]
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
-; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
+; CM-NEXT: ALU 101, @6, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 77, @108, KC0[CB0:0-32], KC1[]
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
+; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T3.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: ALU clause starting at 6:
; CM-NEXT: AND_INT * T0.W, KC0[3].Y, literal.x,
; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
; CM-NEXT: ADD * T1.W, KC0[3].Y, -PV.W,
-; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: RNDNE * T2.W, PV.Z,
-; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: TRUNC T2.Z, PV.W,
+; CM-NEXT: MUL_IEEE T0.Z, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE * T2.W, T0.W, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: RNDNE T1.Z, PV.W,
; CM-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.Z,
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T0.Y, T0.W, literal.x, PV.W,
-; CM-NEXT: ADD T0.Z, T0.Z, -T2.W,
-; CM-NEXT: FLT_TO_INT * T0.W, PV.Z,
+; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD * T0.W, T2.W, -PV.Z, BS:VEC_120/SCL_212
; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
+; CM-NEXT: TRUNC T1.Z, T1.Z,
+; CM-NEXT: ADD * T0.W, PV.W, PV.Z,
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: FLT_TO_INT T0.Z, T1.Z,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MUL_IEEE T0.Y, PV.X, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T1.Z, literal.y,
-; CM-NEXT: MAX_INT * T1.W, T0.W, literal.z,
-; CM-NEXT: 2130706432(1.701412e+38), -254(nan)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T1.X, T0.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T0.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.W, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
+; CM-NEXT: ADD_INT T1.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T1.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T2.X, T0.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.z,
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.Z, PV.Y, T0.W,
-; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.X, T0.Z,
-; CM-NEXT: SETGT_INT * T0.W, T0.W, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), 127(1.779649e-43)
+; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T2.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T3.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.X,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T1.W, PV.X, T2.W,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T0.Y, T0.W,
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T1.X, T2.X, T0.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, T0.X,
+; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T0.Y, T2.Y, T3.X, PV.Z,
; CM-NEXT: LSHL T0.Z, PV.Y, literal.x,
-; CM-NEXT: AND_INT * T1.W, KC0[3].Z, literal.y,
+; CM-NEXT: AND_INT * T0.W, KC0[3].Z, literal.y,
; CM-NEXT: 23(3.222986e-44), -4096(nan)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, literal.x,
; CM-NEXT: ADD T1.Y, KC0[3].Z, -PV.W,
-; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T0.W, PV.Y, PV.X,
-; CM-NEXT: 1079283712(3.321289e+00), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.Y, PV.W, PV.Z,
-; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT: RNDNE * T0.W, PV.X,
-; CM-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T0.Z, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT * T1.W, T3.W, PV.X, PV.Y,
+; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
+; CM-NEXT: MUL_IEEE T0.Y, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.W, literal.y,
+; CM-NEXT: AND_INT * T1.W, KC0[3].W, literal.z,
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: -4096(nan), 0(0.000000e+00)
; CM-NEXT: SETGT T1.X, literal.x, KC0[3].Y,
-; CM-NEXT: TRUNC T2.Y, PV.W,
-; CM-NEXT: AND_INT T1.Z, KC0[3].W, literal.y,
-; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.z, PV.Z,
-; CM-NEXT: -1036817932(-4.485347e+01), -4096(nan)
-; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, PV.W,
-; CM-NEXT: MUL_IEEE T1.Y, PV.Z, literal.y,
-; CM-NEXT: FLT_TO_INT T0.Z, PV.Y,
-; CM-NEXT: ADD * T1.W, KC0[3].W, -PV.Z,
+; CM-NEXT: ADD T2.Y, KC0[3].W, -PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: MULADD_IEEE * T2.W, T1.Y, literal.y, PV.Y,
+; CM-NEXT: -1036817932(-4.485347e+01), 1079283712(3.321289e+00)
+; CM-NEXT: MULADD_IEEE T2.X, T0.W, literal.x, PV.W,
+; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, T1.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
-; CM-NEXT: ADD T0.X, T0.X, -T0.W,
-; CM-NEXT: MUL_IEEE T2.Y, PV.W, literal.x,
-; CM-NEXT: MAX_INT T2.Z, PV.Z, literal.y,
-; CM-NEXT: RNDNE * T0.W, PV.Y,
-; CM-NEXT: 975668412(6.390323e-04), -330(nan)
-; CM-NEXT: TRUNC T3.X, PV.W,
-; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T2.Z, T1.W, literal.y, PV.Y,
-; CM-NEXT: ADD * T1.W, PV.X, T2.X,
-; CM-NEXT: 204(2.858649e-43), 1079283712(3.321289e+00)
-; CM-NEXT: EXP_IEEE T0.X, T1.W,
-; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: ADD_INT T2.X, T0.Z, literal.x,
-; CM-NEXT: MULADD_IEEE T2.Y, T1.Z, literal.y, T2.Z, BS:VEC_102/SCL_221
-; CM-NEXT: ADD T1.Z, T1.Y, -T0.W,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.z,
-; CM-NEXT: 102(1.429324e-43), 975668412(6.390323e-04)
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T4.X, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.y,
-; CM-NEXT: SETGT_UINT T2.Z, T0.Z, literal.z,
-; CM-NEXT: ADD * T1.W, PV.Z, PV.Y,
-; CM-NEXT: -229(nan), 2130706432(1.701412e+38)
-; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: TRUNC T3.X, T1.Z,
+; CM-NEXT: RNDNE T1.Y, PV.W,
+; CM-NEXT: MULADD_IEEE T0.Z, T2.Y, literal.x, PV.Z,
+; CM-NEXT: ADD * T2.W, PV.Y, PV.X,
+; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE T0.Y, T2.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T2.W,
+; CM-NEXT: MULADD_IEEE T2.X, T1.W, literal.x, T0.Z,
+; CM-NEXT: ADD T2.Y, T0.W, -T1.Y, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T0.Z, T3.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
+; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, PV.Z, literal.y,
+; CM-NEXT: TRUNC T1.Z, T1.Y,
+; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
; CM-NEXT: EXP_IEEE T1.X (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T1.Y (MASKED), T1.W,
-; CM-NEXT: EXP_IEEE T1.Z, T1.W,
+; CM-NEXT: EXP_IEEE T1.Y, T1.W,
+; CM-NEXT: EXP_IEEE T1.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T1.W (MASKED), T1.W,
-; CM-NEXT: ALU clause starting at 109:
-; CM-NEXT: CNDE_INT T5.X, T2.Z, T0.W, T1.Y,
-; CM-NEXT: CNDE_INT T1.Y, T4.X, T3.Y, T2.X,
-; CM-NEXT: FLT_TO_INT T3.Z, T3.X, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T2.X, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.X, literal.y,
-; CM-NEXT: MUL_IEEE T4.Z, PV.W, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, PV.Z, literal.w,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T3.X, PV.W, T0.W, PV.Z,
-; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: CNDE_INT T4.Z, PV.X, T1.Y, T0.Z,
-; CM-NEXT: MAX_INT * T0.W, T3.Z, literal.y,
-; CM-NEXT: 209715200(1.972152e-31), -330(nan)
-; CM-NEXT: ADD_INT T6.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T1.Y, T3.Z, literal.y,
-; CM-NEXT: SETGT_UINT T5.Z, T3.Z, literal.z,
-; CM-NEXT: MUL_IEEE * T0.W, T1.Z, literal.w, BS:VEC_120/SCL_212
+; CM-NEXT: FLT_TO_INT T2.X, T1.Z,
+; CM-NEXT: MUL_IEEE T2.Y, PV.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Z, T3.Y, T3.X, T0.W,
+; CM-NEXT: SETGT_INT * T0.W, T0.Z, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: 209715200(1.972152e-31), -127(nan)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.Y,
+; CM-NEXT: MUL_IEEE * T4.Y, PV.Y, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: ALU clause starting at 108:
+; CM-NEXT: SETGT_UINT T1.Z, T2.X, literal.x,
+; CM-NEXT: MAX_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: -229(nan), -330(nan)
+; CM-NEXT: ADD_INT T4.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T5.Y, T0.Z, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Z, T4.Y, T2.Y,
+; CM-NEXT: SETGT_INT * T1.W, T2.X, literal.z,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T7.X, PV.W, literal.x,
-; CM-NEXT: MIN_INT T4.Y, T3.Z, literal.y,
-; CM-NEXT: CNDE_INT T6.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: SETGT_INT * T2.W, T3.Z, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 381(5.338947e-43)
; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, T3.Z,
-; CM-NEXT: MIN_INT T1.Y, T0.Z, literal.x,
-; CM-NEXT: ADD_INT T6.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T3.W, T3.Z, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 381(5.338947e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T8.X, T1.W, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T4.Y, T3.Z, literal.x,
-; CM-NEXT: ADD_INT T3.Z, PV.Y, literal.y,
-; CM-NEXT: ADD_INT * T1.W, T0.Z, literal.z, BS:VEC_120/SCL_212
+; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T1.Y,
+; CM-NEXT: MUL_IEEE T0.Y, T0.Y, literal.x,
+; CM-NEXT: MAX_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: CNDE_INT * T2.W, T3.Y, PV.X, PV.Y, BS:VEC_120/SCL_212
+; CM-NEXT: 2130706432(1.701412e+38), -330(nan)
+; CM-NEXT: CNDE_INT T4.X, T0.W, PV.W, T0.Z,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
+; CM-NEXT: ADD_INT T2.Z, T2.X, literal.y,
+; CM-NEXT: MIN_INT * T0.W, T2.X, literal.z,
+; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T6.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T3.Y, T2.X, literal.y,
+; CM-NEXT: SETGT_UINT T3.Z, T2.X, literal.z,
+; CM-NEXT: CNDE_INT * T0.W, T1.Z, PV.Y, PV.Z,
+; CM-NEXT: -254(nan), -127(nan)
+; CM-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x,
+; CM-NEXT: CNDE_INT T1.Y, T1.W, PV.W, T2.X,
+; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.Y, PV.X,
+; CM-NEXT: MIN_INT * T0.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T2.X, T2.X, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T9.X, T2.Z, PV.W, PV.Z,
-; CM-NEXT: SETGT_INT T1.Y, T0.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T5.Z, T7.X, T0.W, BS:VEC_201
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T6.X, T2.W, PV.W, T1.Z,
-; CM-NEXT: LSHL T5.Y, PV.Z, literal.x,
-; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.Z, PV.X,
-; CM-NEXT: CNDE_INT * T0.W, T4.X, T3.Y, T2.Y,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T2.X, PV.W, T0.X,
-; CM-NEXT: LSHL T2.Y, PV.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: SETGT_INT T2.Y, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.X, T1.Y, T1.Z,
+; CM-NEXT: MUL_IEEE * T1.W, T7.X, literal.y,
+; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T7.X, T3.Z, T7.X, PV.W,
+; CM-NEXT: LSHL T1.Y, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, T4.X, PV.X, BS:VEC_021/SCL_122
+; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.y,
+; CM-NEXT: 23(3.222986e-44), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T4.X, T0.W, T0.Y, PV.W,
+; CM-NEXT: LSHL T0.Y, PV.Z, literal.x,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T4.Y, PV.X, T3.X, BS:VEC_021/SCL_122
+; CM-NEXT: CNDE_INT * T0.W, T2.X, T5.X, PV.X,
; CM-NEXT: 23(3.222986e-44), 1065353216(1.000000e+00)
; CM-NEXT: MUL_IEEE T2.X, PV.W, PV.Z,
-; CM-NEXT: SETGT T3.Y, literal.x, KC0[3].W,
+; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].W,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T1.Y, PV.X, T5.X,
+; CM-NEXT: CNDE_INT * T0.W, T2.Y, T3.X, PV.X,
; CM-NEXT: -1036817932(-4.485347e+01), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT: SETGT T1.Y, literal.x, KC0[3].Z,
+; CM-NEXT: MUL_IEEE T3.X, PV.W, PV.Z,
+; CM-NEXT: SETGT T0.Y, literal.x, KC0[3].Z,
; CM-NEXT: CNDE T0.Z, PV.Y, PV.X, 0.0,
; CM-NEXT: SETGT * T0.W, KC0[3].W, literal.y,
; CM-NEXT: -1036817932(-4.485347e+01), 1109008539(3.853184e+01)
; CM-NEXT: CNDE T2.X, PV.W, PV.Z, literal.x,
-; CM-NEXT: CNDE T1.Y, PV.Y, PV.X, 0.0,
+; CM-NEXT: CNDE T0.Y, PV.Y, PV.X, 0.0,
; CM-NEXT: SETGT T0.Z, KC0[3].Z, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2139095040(INF), 1109008539(3.853184e+01)
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
-; CM-NEXT: LSHR T0.X, PV.W, literal.x,
-; CM-NEXT: CNDE T1.Y, PV.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE T0.Z, T1.X, T0.Y, 0.0,
+; CM-NEXT: LSHR T3.X, PV.W, literal.x,
+; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, literal.y,
+; CM-NEXT: CNDE T0.Z, T1.X, T0.X, 0.0,
; CM-NEXT: SETGT * T0.W, KC0[3].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 2139095040(INF)
; CM-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00)
-; CM-NEXT: CNDE * T1.X, PV.W, PV.Z, literal.x,
+; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x,
; CM-NEXT: 2139095040(INF), 0(0.000000e+00)
-; CM-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
+; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = call <3 x float> @llvm.exp10.v3f32(<3 x float> %in)
store <3 x float> %result, ptr addrspace(1) %out
@@ -2052,227 +2043,224 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; R600-LABEL: s_exp10_v4f32:
; R600: ; %bb.0:
; R600-NEXT: ALU 98, @6, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 98, @105, KC0[CB0:0-32], KC1[]
-; R600-NEXT: ALU 24, @204, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 95, @105, KC0[CB0:0-32], KC1[]
+; R600-NEXT: ALU 24, @201, KC0[CB0:0-32], KC1[]
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1
; R600-NEXT: CF_END
; R600-NEXT: PAD
; R600-NEXT: ALU clause starting at 6:
; R600-NEXT: AND_INT * T0.W, KC0[3].Z, literal.x,
; R600-NEXT: -4096(nan), 0(0.000000e+00)
-; R600-NEXT: ADD T1.W, KC0[3].Z, -PV.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.W, literal.x,
+; R600-NEXT: ADD * T1.W, KC0[3].Z, -PV.W,
+; R600-NEXT: MUL_IEEE T2.W, PV.W, literal.x,
+; R600-NEXT: MUL_IEEE * T3.W, T0.W, literal.y,
+; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; R600-NEXT: RNDNE T4.W, PS,
+; R600-NEXT: MULADD_IEEE * T1.W, T1.W, literal.x, PV.W, BS:VEC_021/SCL_122
; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T3.W, PS,
-; R600-NEXT: MUL_IEEE * T4.W, PV.W, literal.x,
+; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PS,
+; R600-NEXT: ADD * T1.W, T3.W, -PV.W,
; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T1.W, T1.W, literal.x, PS,
-; R600-NEXT: TRUNC * T4.W, PV.W,
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: FLT_TO_INT T0.Z, PS,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.x, PV.W,
-; R600-NEXT: ADD * T1.W, T2.W, -T3.W,
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: ADD T1.Z, PS, PV.W,
-; R600-NEXT: MAX_INT T0.W, PV.Z, literal.x,
-; R600-NEXT: MIN_INT * T1.W, PV.Z, literal.y,
-; R600-NEXT: -330(nan), 381(5.338947e-43)
-; R600-NEXT: ADD_INT T0.X, PS, literal.x,
-; R600-NEXT: ADD_INT T0.Y, PV.W, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT T0.W, T0.Z, literal.w,
-; R600-NEXT: EXP_IEEE * T1.X, PV.Z,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T2.X, T0.Z, literal.x,
-; R600-NEXT: SETGT_UINT T1.Y, T0.Z, literal.y,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T1.W, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE * T2.W, PS, literal.z,
-; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T3.X, T1.X, literal.x,
-; R600-NEXT: MUL_IEEE T0.Y, PS, literal.y,
-; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T0.Z,
-; R600-NEXT: CNDE_INT T3.W, PV.Y, PV.X, T0.X,
-; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
-; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: AND_INT T2.Y, KC0[4].X, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT T0.W, T0.W, PV.Y, T2.W,
-; R600-NEXT: MUL_IEEE * T2.W, PV.X, literal.y,
-; R600-NEXT: -4096(nan), 2130706432(1.701412e+38)
-; R600-NEXT: CNDE_INT T0.X, T1.Y, T3.X, PS,
-; R600-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.X,
-; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
-; R600-NEXT: ADD T0.W, KC0[4].X, -PV.Y,
-; R600-NEXT: MUL_IEEE * T1.W, PV.Y, literal.y,
-; R600-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00)
-; R600-NEXT: RNDNE T1.Y, PS,
-; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; R600-NEXT: ADD_INT T2.W, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT * T3.W, T4.W, PV.Y, PV.X,
-; R600-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00)
-; R600-NEXT: MUL_IEEE T0.Y, PS, PV.W,
-; R600-NEXT: AND_INT T0.Z, KC0[3].W, literal.x,
-; R600-NEXT: MULADD_IEEE T0.W, T0.W, literal.y, PV.Z,
-; R600-NEXT: TRUNC * T2.W, PV.Y,
-; R600-NEXT: -4096(nan), 1079283712(3.321289e+00)
-; R600-NEXT: SETGT T0.X, literal.x, KC0[3].Z,
-; R600-NEXT: FLT_TO_INT T3.Y, PS,
-; R600-NEXT: MULADD_IEEE T1.Z, T2.Y, literal.y, PV.W,
-; R600-NEXT: ADD T0.W, T1.W, -T1.Y,
-; R600-NEXT: MUL_IEEE * T1.W, PV.Z, literal.z,
-; R600-NEXT: -1036817932(-4.485347e+01), 975668412(6.390323e-04)
-; R600-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
-; R600-NEXT: RNDNE T1.X, PS,
-; R600-NEXT: AND_INT T1.Y, KC0[3].Y, literal.x,
-; R600-NEXT: ADD T1.Z, PV.W, PV.Z,
-; R600-NEXT: MAX_INT T0.W, PV.Y, literal.y,
-; R600-NEXT: MIN_INT * T2.W, PV.Y, literal.z,
-; R600-NEXT: -4096(nan), -330(nan)
+; R600-NEXT: ADD T0.W, PS, PV.W,
+; R600-NEXT: TRUNC * T1.W, T4.W,
+; R600-NEXT: FLT_TO_INT T1.W, PS,
+; R600-NEXT: EXP_IEEE * T0.X, PV.W,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: MAX_INT T0.W, PV.W, literal.y,
+; R600-NEXT: MIN_INT * T2.W, PV.W, literal.z,
+; R600-NEXT: 209715200(1.972152e-31), -330(nan)
; R600-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T2.X, PS, literal.x,
-; R600-NEXT: ADD_INT T2.Y, PV.W, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T3.Y, literal.z,
-; R600-NEXT: SETGT_UINT T0.W, T3.Y, literal.w,
-; R600-NEXT: EXP_IEEE * T1.Z, PV.Z,
-; R600-NEXT: -254(nan), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -229(nan)
-; R600-NEXT: ADD_INT T3.X, T3.Y, literal.x,
-; R600-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y,
-; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT T2.W, T3.Y, literal.x,
-; R600-NEXT: MUL_IEEE * T3.W, PS, literal.z,
+; R600-NEXT: ADD_INT T1.X, PS, literal.x,
+; R600-NEXT: AND_INT T0.Y, KC0[4].X, literal.y,
+; R600-NEXT: ADD_INT T1.Z, PV.W, literal.z,
+; R600-NEXT: ADD_INT * T0.W, T1.W, literal.w,
+; R600-NEXT: -254(nan), -4096(nan)
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: SETGT_UINT * T2.W, T1.W, literal.x,
+; R600-NEXT: -229(nan), 0(0.000000e+00)
+; R600-NEXT: ADD_INT T2.X, T1.W, literal.x,
+; R600-NEXT: SETGT_UINT T1.Y, T1.W, literal.y,
+; R600-NEXT: CNDE_INT T1.Z, PV.W, T1.Z, T0.W,
+; R600-NEXT: SETGT_INT T0.W, T1.W, literal.x,
+; R600-NEXT: ADD * T3.W, KC0[4].X, -T0.Y,
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T4.X, T1.Z, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
-; R600-NEXT: CNDE_INT T2.Z, PV.W, PV.Z, T3.Y,
-; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T2.X,
-; R600-NEXT: SETGT_INT * T5.W, T3.Y, literal.z,
-; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, T0.Y, literal.y,
+; R600-NEXT: CNDE_INT T1.Z, PV.W, PV.Z, T1.W,
+; R600-NEXT: CNDE_INT T4.W, PV.Y, PV.X, T1.X,
+; R600-NEXT: SETGT_INT * T1.W, T1.W, literal.z,
+; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
; R600-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; R600-NEXT: ADD T2.X, KC0[3].W, -T0.Z,
-; R600-NEXT: CNDE_INT T3.Y, PS, PV.Z, PV.W,
-; R600-NEXT: CNDE_INT * T2.Z, T0.W, PV.Y, T3.W,
-; R600-NEXT: ALU clause starting at 105:
-; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.x,
-; R600-NEXT: ADD * T3.W, KC0[3].Y, -T1.Y,
+; R600-NEXT: CNDE_INT T1.X, PS, PV.Z, PV.W,
+; R600-NEXT: RNDNE T3.Y, PV.Y,
+; R600-NEXT: MULADD_IEEE T1.Z, T3.W, literal.x, PV.X,
+; R600-NEXT: MUL_IEEE T3.W, T0.Z, literal.y,
+; R600-NEXT: MUL_IEEE * T4.W, T0.X, literal.z,
+; R600-NEXT: 1079283712(3.321289e+00), 209715200(1.972152e-31)
; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T2.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T4.Y, T2.W, PV.W, T0.Z,
+; R600-NEXT: MULADD_IEEE T0.Z, T0.Y, literal.y, PV.Z,
+; R600-NEXT: ADD T2.W, T2.Y, -PV.Y, BS:VEC_120/SCL_212
+; R600-NEXT: AND_INT * T3.W, KC0[3].Y, literal.z,
+; R600-NEXT: 2130706432(1.701412e+38), 975668412(6.390323e-04)
+; R600-NEXT: -4096(nan), 0(0.000000e+00)
; R600-NEXT: MUL_IEEE T3.X, PS, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, T1.Y, literal.y,
-; R600-NEXT: CNDE_INT T3.Z, T4.Y, T4.X, PV.W, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T0.W, T2.W, T2.Z, T1.Z,
-; R600-NEXT: LSHL * T2.W, T3.Y, literal.z,
-; R600-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
-; R600-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; R600-NEXT: ADD_INT T4.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T3.Y, T5.W, PV.W, PV.Z,
-; R600-NEXT: RNDNE T1.Z, PV.Y,
-; R600-NEXT: MULADD_IEEE T0.W, T3.W, literal.y, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT: MUL_IEEE * T2.W, T2.X, literal.z,
+; R600-NEXT: ADD T0.Y, PV.W, PV.Z,
+; R600-NEXT: CNDE_INT T0.Z, T0.W, PV.Y, T0.X, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.W, T1.Y, T4.W, PV.X,
+; R600-NEXT: LSHL * T2.W, T1.X, literal.y,
+; R600-NEXT: 1079283712(3.321289e+00), 23(3.222986e-44)
+; R600-NEXT: AND_INT T0.X, KC0[3].W, literal.x,
+; R600-NEXT: TRUNC T1.Y, T3.Y,
+; R600-NEXT: ADD_INT T1.Z, PS, literal.y,
+; R600-NEXT: CNDE_INT T0.W, T1.W, PV.Z, PV.W,
+; R600-NEXT: EXP_IEEE * T0.Y, PV.Y,
+; R600-NEXT: -4096(nan), 1065353216(1.000000e+00)
+; R600-NEXT: MUL_IEEE T1.X, PV.W, PV.Z,
+; R600-NEXT: FLT_TO_INT T1.Y, PV.Y,
+; R600-NEXT: MUL_IEEE T0.Z, PS, literal.x,
+; R600-NEXT: ADD T0.W, KC0[3].W, -PV.X,
+; R600-NEXT: RNDNE * T1.W, T3.X,
+; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; R600-NEXT: SETGT T2.X, literal.x, KC0[3].Z,
+; R600-NEXT: TRUNC T2.Y, PS,
+; R600-NEXT: MUL_IEEE T1.Z, PV.W, literal.y,
+; R600-NEXT: MUL_IEEE T2.W, PV.Z, literal.z,
+; R600-NEXT: MAX_INT * T4.W, PV.Y, literal.w,
+; R600-NEXT: -1036817932(-4.485347e+01), 975668412(6.390323e-04)
+; R600-NEXT: 209715200(1.972152e-31), -330(nan)
+; R600-NEXT: ADD T4.X, KC0[3].Y, -T3.W,
+; R600-NEXT: ADD_INT T3.Y, PS, literal.x,
+; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T4.W, T1.Y, literal.z,
+; R600-NEXT: MIN_INT * T5.W, T1.Y, literal.w,
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: -229(nan), 381(5.338947e-43)
+; R600-NEXT: ADD_INT T5.X, PS, literal.x,
+; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y,
+; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z,
+; R600-NEXT: CNDE_INT T5.W, PV.W, PV.Y, PV.Z,
+; R600-NEXT: SETGT_INT * T6.W, T1.Y, literal.y,
+; R600-NEXT: -254(nan), -127(nan)
+; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T6.X, T0.Y, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, PS, PV.W, T1.Y,
+; R600-NEXT: CNDE_INT * T2.Z, PV.Z, PV.Y, PV.X,
+; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: ALU clause starting at 105:
+; R600-NEXT: SETGT_INT T5.W, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE * T7.W, T4.X, literal.y,
+; R600-NEXT: 127(1.779649e-43), 975668412(6.390323e-04)
+; R600-NEXT: MUL_IEEE T5.X, T0.X, literal.x,
+; R600-NEXT: MULADD_IEEE T1.Y, T4.X, literal.x, PS, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T2.Z, PV.W, T3.Y, T2.Z,
+; R600-NEXT: MUL_IEEE T7.W, T6.X, literal.y, BS:VEC_201
+; R600-NEXT: CNDE_INT * T2.W, T4.W, T2.W, T0.Z,
+; R600-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T4.X, T6.W, PS, T0.Y,
+; R600-NEXT: CNDE_INT T0.Y, T3.Z, T6.X, PV.W,
+; R600-NEXT: LSHL T0.Z, PV.Z, literal.x,
+; R600-NEXT: MULADD_IEEE T2.W, T3.W, literal.y, PV.Y, BS:VEC_201
+; R600-NEXT: ADD * T1.W, T3.X, -T1.W,
+; R600-NEXT: 23(3.222986e-44), 975668412(6.390323e-04)
+; R600-NEXT: ADD T3.X, PS, PV.W,
+; R600-NEXT: ADD_INT T1.Y, PV.Z, literal.x,
+; R600-NEXT: CNDE_INT T0.Z, T5.W, PV.X, PV.Y,
+; R600-NEXT: RNDNE T1.W, T5.X,
+; R600-NEXT: MULADD_IEEE * T0.W, T0.W, literal.y, T1.Z, BS:VEC_021/SCL_122
; R600-NEXT: 1065353216(1.000000e+00), 1079283712(3.321289e+00)
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: MULADD_IEEE T2.X, T2.X, literal.x, PS,
-; R600-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.y, PV.W,
-; R600-NEXT: ADD T2.Z, T2.Y, -PV.Z, BS:VEC_120/SCL_212
-; R600-NEXT: MUL_IEEE T0.W, PV.Y, PV.X,
-; R600-NEXT: SETGT * T2.W, literal.z, KC0[4].X,
-; R600-NEXT: 1079283712(3.321289e+00), 975668412(6.390323e-04)
-; R600-NEXT: -1036817932(-4.485347e+01), 0(0.000000e+00)
-; R600-NEXT: CNDE T3.X, PS, PV.W, 0.0,
-; R600-NEXT: ADD T1.Y, PV.Z, PV.Y,
-; R600-NEXT: TRUNC T1.Z, T1.Z,
-; R600-NEXT: MULADD_IEEE T0.W, T0.Z, literal.x, PV.X, BS:VEC_120/SCL_212
-; R600-NEXT: ADD * T1.W, T1.W, -T1.X,
-; R600-NEXT: 975668412(6.390323e-04), 0(0.000000e+00)
-; R600-NEXT: SETGT T2.X, KC0[4].X, literal.x,
-; R600-NEXT: ADD T2.Y, PS, PV.W,
-; R600-NEXT: FLT_TO_INT T0.Z, PV.Z,
-; R600-NEXT: TRUNC T0.W, T1.X,
-; R600-NEXT: EXP_IEEE * T1.X, PV.Y,
-; R600-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T4.X, PS, literal.x,
-; R600-NEXT: FLT_TO_INT T1.Y, PV.W,
-; R600-NEXT: MAX_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: MUL_IEEE T0.W, PS, literal.z,
-; R600-NEXT: EXP_IEEE * T1.W, PV.Y,
-; R600-NEXT: 2130706432(1.701412e+38), -330(nan)
+; R600-NEXT: MULADD_IEEE T0.X, T0.X, literal.x, PS,
+; R600-NEXT: ADD T0.Y, T5.X, -PV.W, BS:VEC_120/SCL_212
+; R600-NEXT: MUL_IEEE T0.Z, PV.Z, PV.Y,
+; R600-NEXT: SETGT T0.W, literal.y, KC0[4].X,
+; R600-NEXT: EXP_IEEE * T1.Y, PV.X,
+; R600-NEXT: 975668412(6.390323e-04), -1036817932(-4.485347e+01)
+; R600-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0,
+; R600-NEXT: ADD T0.Y, PV.Y, PV.X,
+; R600-NEXT: FLT_TO_INT T0.Z, T2.Y,
+; R600-NEXT: TRUNC T0.W, T1.W,
+; R600-NEXT: MUL_IEEE * T1.W, PS, literal.x,
; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T5.X, PV.W, literal.x,
-; R600-NEXT: MUL_IEEE T2.Y, PS, literal.x,
-; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T2.W, T0.Z, literal.z,
-; R600-NEXT: MAX_INT * T3.W, PV.Y, literal.w,
-; R600-NEXT: 209715200(1.972152e-31), 204(2.858649e-43)
-; R600-NEXT: 102(1.429324e-43), -330(nan)
-; R600-NEXT: SETGT_UINT T6.X, T0.Z, literal.x,
-; R600-NEXT: ADD_INT T3.Y, PS, literal.y,
-; R600-NEXT: ADD_INT T2.Z, T1.Y, literal.z,
-; R600-NEXT: SETGT_UINT T3.W, T1.Y, literal.x,
-; R600-NEXT: MIN_INT * T4.W, T1.Y, literal.w,
+; R600-NEXT: SETGT T0.X, KC0[4].X, literal.x,
+; R600-NEXT: MUL_IEEE T2.Y, PS, literal.y,
+; R600-NEXT: FLT_TO_INT T1.Z, PV.W,
+; R600-NEXT: MAX_INT T0.W, PV.Z, literal.z,
+; R600-NEXT: EXP_IEEE * T0.Y, PV.Y,
+; R600-NEXT: 1109008539(3.853184e+01), 209715200(1.972152e-31)
+; R600-NEXT: -330(nan), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T4.X, T1.Y, literal.x,
+; R600-NEXT: MUL_IEEE T3.Y, PS, literal.y,
+; R600-NEXT: ADD_INT T2.Z, PV.W, literal.z,
+; R600-NEXT: ADD_INT * T0.W, T0.Z, literal.w,
+; R600-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; R600-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; R600-NEXT: MAX_INT * T2.W, T1.Z, literal.x,
+; R600-NEXT: -330(nan), 0(0.000000e+00)
+; R600-NEXT: SETGT_UINT T5.X, T0.Z, literal.x,
+; R600-NEXT: ADD_INT T4.Y, PV.W, literal.y,
+; R600-NEXT: ADD_INT T3.Z, T1.Z, literal.z, BS:VEC_120/SCL_212
+; R600-NEXT: SETGT_UINT T2.W, T1.Z, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: MIN_INT * T3.W, T1.Z, literal.w,
; R600-NEXT: -229(nan), 204(2.858649e-43)
; R600-NEXT: 102(1.429324e-43), 381(5.338947e-43)
-; R600-NEXT: ADD_INT T7.X, PS, literal.x,
-; R600-NEXT: ADD_INT T4.Y, T1.Y, literal.y,
-; R600-NEXT: SETGT_UINT T3.Z, T1.Y, literal.z,
-; R600-NEXT: CNDE_INT T4.W, PV.W, PV.Y, PV.Z,
-; R600-NEXT: SETGT_INT * T5.W, T1.Y, literal.y,
+; R600-NEXT: ADD_INT T6.X, PS, literal.x,
+; R600-NEXT: ADD_INT T5.Y, T1.Z, literal.y,
+; R600-NEXT: SETGT_UINT T4.Z, T1.Z, literal.z,
+; R600-NEXT: CNDE_INT T3.W, PV.W, PV.Y, PV.Z,
+; R600-NEXT: SETGT_INT * T4.W, T1.Z, literal.y,
; R600-NEXT: -254(nan), -127(nan)
; R600-NEXT: 254(3.559298e-43), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T8.X, PS, PV.W, T1.Y,
-; R600-NEXT: CNDE_INT T3.Y, PV.Z, PV.Y, PV.X,
-; R600-NEXT: SETGT_INT T2.Z, T1.Y, literal.x,
-; R600-NEXT: CNDE_INT T2.W, T6.X, T1.Z, T2.W,
-; R600-NEXT: SETGT_INT * T4.W, T0.Z, literal.y,
+; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T1.Z, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.Y, PV.X,
+; R600-NEXT: SETGT_INT T1.Z, T1.Z, literal.x, BS:VEC_120/SCL_212
+; R600-NEXT: CNDE_INT T0.W, T5.X, T2.Z, T0.W, BS:VEC_102/SCL_221
+; R600-NEXT: SETGT_INT * T3.W, T0.Z, literal.y,
; R600-NEXT: 127(1.779649e-43), -127(nan)
-; R600-NEXT: CNDE_INT T7.X, PS, PV.W, T0.Z,
-; R600-NEXT: CNDE_INT T1.Y, PV.Z, PV.X, PV.Y,
-; R600-NEXT: MIN_INT T1.Z, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE T2.W, T1.W, literal.y,
-; R600-NEXT: MUL_IEEE * T6.W, T2.Y, literal.z,
-; R600-NEXT: 381(5.338947e-43), 2130706432(1.701412e+38)
-; R600-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T8.X, T3.W, PS, T2.Y,
-; R600-NEXT: MUL_IEEE T2.Y, PV.W, literal.x,
-; R600-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
-; R600-NEXT: ADD_INT T3.W, T0.Z, literal.z,
-; R600-NEXT: SETGT_UINT * T6.W, T0.Z, literal.w,
+; R600-NEXT: CNDE_INT T6.X, PS, PV.W, T0.Z,
+; R600-NEXT: CNDE_INT T4.Y, PV.Z, PV.X, PV.Y,
+; R600-NEXT: MIN_INT T2.Z, T0.Z, literal.x,
+; R600-NEXT: MUL_IEEE T0.W, T3.Y, literal.y,
+; R600-NEXT: MUL_IEEE * T5.W, T0.Y, literal.z,
+; R600-NEXT: 381(5.338947e-43), 209715200(1.972152e-31)
+; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; R600-NEXT: MUL_IEEE T7.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T3.Y, T2.W, PV.W, T3.Y,
+; R600-NEXT: ADD_INT T2.Z, PV.Z, literal.y,
+; R600-NEXT: ADD_INT T0.W, T0.Z, literal.z,
+; R600-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; R600-NEXT: 2130706432(1.701412e+38), -254(nan)
; R600-NEXT: -127(nan), 254(3.559298e-43)
-; R600-NEXT: CNDE_INT T9.X, PS, PV.W, PV.Z,
-; R600-NEXT: SETGT_INT T3.Y, T0.Z, literal.x,
-; R600-NEXT: CNDE_INT T0.Z, T3.Z, T2.W, PV.Y, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE_INT T1.W, T5.W, PV.X, T1.W, BS:VEC_021/SCL_122
-; R600-NEXT: LSHL * T2.W, T1.Y, literal.y,
+; R600-NEXT: CNDE_INT T8.X, PS, PV.W, PV.Z,
+; R600-NEXT: SETGT_INT T5.Y, T0.Z, literal.x,
+; R600-NEXT: CNDE_INT T0.Z, T4.W, PV.Y, T0.Y, BS:VEC_021/SCL_122
+; R600-NEXT: CNDE_INT T0.W, T4.Z, T5.W, PV.X, BS:VEC_120/SCL_212
+; R600-NEXT: LSHL * T4.W, T4.Y, literal.y,
; R600-NEXT: 127(1.779649e-43), 23(3.222986e-44)
-; R600-NEXT: ADD_INT T8.X, PS, literal.x,
-; R600-NEXT: CNDE_INT T1.Y, T2.Z, PV.W, PV.Z,
-; R600-NEXT: CNDE_INT T0.Z, PV.Y, T7.X, PV.X,
-; R600-NEXT: CNDE_INT * T0.W, T6.X, T5.X, T0.W, BS:VEC_021/SCL_122
-; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE * T1.W, T4.X, literal.x,
-; R600-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; R600-NEXT: CNDE_INT T4.X, T6.W, T4.X, PV.W,
-; R600-NEXT: CNDE_INT * T2.Y, T4.W, T0.W, T1.X, BS:VEC_120/SCL_212
-; R600-NEXT: ALU clause starting at 204:
+; R600-NEXT: ADD_INT T7.X, PS, literal.x,
+; R600-NEXT: CNDE_INT T0.Y, T1.Z, PV.Z, PV.W,
+; R600-NEXT: CNDE_INT T0.Z, PV.Y, T6.X, PV.X,
+; R600-NEXT: MUL_IEEE T0.W, T4.X, literal.y,
+; R600-NEXT: CNDE_INT * T1.W, T5.X, T2.Y, T1.W,
+; R600-NEXT: 1065353216(1.000000e+00), 2130706432(1.701412e+38)
+; R600-NEXT: CNDE_INT T5.X, T3.W, PS, T1.Y,
+; R600-NEXT: CNDE_INT * T1.Y, T2.W, T4.X, PV.W, BS:VEC_120/SCL_212
+; R600-NEXT: ALU clause starting at 201:
; R600-NEXT: LSHL T0.Z, T0.Z, literal.x,
-; R600-NEXT: MUL_IEEE T0.W, T1.Y, T8.X,
+; R600-NEXT: MUL_IEEE T0.W, T0.Y, T7.X,
; R600-NEXT: SETGT * T1.W, literal.y, KC0[3].W,
; R600-NEXT: 23(3.222986e-44), -1036817932(-4.485347e+01)
-; R600-NEXT: CNDE T1.X, PS, PV.W, 0.0,
-; R600-NEXT: SETGT T1.Y, KC0[3].W, literal.x,
+; R600-NEXT: CNDE T4.X, PS, PV.W, 0.0,
+; R600-NEXT: SETGT T0.Y, KC0[3].W, literal.x,
; R600-NEXT: ADD_INT T0.Z, PV.Z, literal.y,
-; R600-NEXT: CNDE_INT T0.W, T3.Y, T2.Y, T4.X, BS:VEC_120/SCL_212
-; R600-NEXT: CNDE * T1.W, T2.X, T3.X, literal.z,
+; R600-NEXT: CNDE_INT T0.W, T5.Y, T5.X, T1.Y, BS:VEC_102/SCL_221
+; R600-NEXT: CNDE * T1.W, T0.X, T3.X, literal.z,
; R600-NEXT: 1109008539(3.853184e+01), 1065353216(1.000000e+00)
; R600-NEXT: 2139095040(INF), 0(0.000000e+00)
-; R600-NEXT: MUL_IEEE T2.X, PV.W, PV.Z,
+; R600-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
; R600-NEXT: SETGT T2.Y, literal.x, KC0[3].Y,
; R600-NEXT: CNDE T1.Z, PV.Y, PV.X, literal.y,
-; R600-NEXT: CNDE T0.W, T0.X, T0.Y, 0.0,
+; R600-NEXT: CNDE T0.W, T2.X, T1.X, 0.0,
; R600-NEXT: SETGT * T2.W, KC0[3].Z, literal.z,
; R600-NEXT: -1036817932(-4.485347e+01), 2139095040(INF)
; R600-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00)
@@ -2287,8 +2275,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; CM-LABEL: s_exp10_v4f32:
; CM: ; %bb.0:
; CM-NEXT: ALU 97, @6, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 100, @104, KC0[CB0:0-32], KC1[]
-; CM-NEXT: ALU 36, @205, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 97, @104, KC0[CB0:0-32], KC1[]
+; CM-NEXT: ALU 35, @202, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
@@ -2307,224 +2295,220 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
; CM-NEXT: MULADD_IEEE T0.X, T0.W, literal.x, PV.W,
; CM-NEXT: ADD T0.Y, T0.Z, -PV.Z,
-; CM-NEXT: MUL_IEEE T0.Z, PV.Y, literal.x,
-; CM-NEXT: MUL_IEEE * T0.W, T2.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE T0.Z, T2.W, literal.y, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE * T0.W, PV.Y, literal.x,
; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
; CM-NEXT: TRUNC T1.X, T1.Z,
-; CM-NEXT: RNDNE T2.Y, PV.W,
-; CM-NEXT: MULADD_IEEE T0.Z, T1.Y, literal.x, PV.Z,
-; CM-NEXT: ADD * T1.W, PV.Y, PV.X,
+; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.W,
+; CM-NEXT: RNDNE T1.Z, PV.Z,
+; CM-NEXT: ADD * T0.W, PV.Y, PV.X,
; CM-NEXT: 1079283712(3.321289e+00), 0(0.000000e+00)
+; CM-NEXT: EXP_IEEE T0.X, T0.W,
+; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
+; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
+; CM-NEXT: TRUNC T2.X, T1.Z,
+; CM-NEXT: MULADD_IEEE T0.Y, T2.W, literal.x, T1.Y,
+; CM-NEXT: FLT_TO_INT T2.Z, T1.X,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
+; CM-NEXT: ADD T1.X, T0.Z, -T1.Z,
+; CM-NEXT: MUL_IEEE T1.Y, PV.W, literal.x,
+; CM-NEXT: MAX_INT T0.Z, PV.Z, literal.y,
+; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
+; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
+; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T0.Z, T2.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T1.W, T2.Z, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
+; CM-NEXT: 102(1.429324e-43), -229(nan)
+; CM-NEXT: ADD_INT T4.X, T2.Z, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, T2.Z, literal.y,
+; CM-NEXT: CNDE_INT T0.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T2.W, T2.Z, literal.x,
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: MUL_IEEE T5.X, T0.X, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T2.Z,
+; CM-NEXT: CNDE_INT T0.Z, PV.Y, PV.X, T3.X,
+; CM-NEXT: SETGT_INT * T3.W, T2.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 127(1.779649e-43)
+; CM-NEXT: AND_INT T3.X, KC0[3].Z, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Y, PV.Z,
+; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.Y, T0.W,
+; CM-NEXT: -4096(nan), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T1.Y, T3.Y, T5.X, PV.Z,
+; CM-NEXT: LSHL T0.Z, PV.Y, literal.x,
+; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
+; CM-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00)
+; CM-NEXT: RNDNE T4.X, PV.W,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Z, T3.W, PV.X, PV.Y,
+; CM-NEXT: ADD * T1.W, T1.X, T0.Y,
+; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
; CM-NEXT: EXP_IEEE T0.X, T1.W,
; CM-NEXT: EXP_IEEE T0.Y (MASKED), T1.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T1.W,
; CM-NEXT: EXP_IEEE * T0.W (MASKED), T1.W,
-; CM-NEXT: MULADD_IEEE T2.X, T2.W, literal.x, T0.Z,
-; CM-NEXT: ADD T0.Y, T0.W, -T2.Y, BS:VEC_120/SCL_212
-; CM-NEXT: FLT_TO_INT T0.Z, T1.X,
-; CM-NEXT: MUL_IEEE * T0.W, PV.X, literal.y,
-; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
+; CM-NEXT: MUL_IEEE T1.X, T0.Z, T2.Y,
+; CM-NEXT: TRUNC T0.Y, T4.X,
+; CM-NEXT: FLT_TO_INT T0.Z, T2.X, BS:VEC_120/SCL_212
+; CM-NEXT: MUL_IEEE * T1.W, PV.X, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x,
; CM-NEXT: MUL_IEEE T1.Y, T0.X, literal.y,
; CM-NEXT: MAX_INT T1.Z, PV.Z, literal.z,
-; CM-NEXT: MIN_INT * T1.W, PV.Z, literal.w,
+; CM-NEXT: MIN_INT * T2.W, PV.Z, literal.w,
; CM-NEXT: 209715200(1.972152e-31), 2130706432(1.701412e+38)
; CM-NEXT: -330(nan), 381(5.338947e-43)
-; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T3.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
; CM-NEXT: ADD_INT T1.Z, T0.Z, literal.z,
-; CM-NEXT: SETGT_UINT * T1.W, T0.Z, literal.w,
+; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; CM-NEXT: -254(nan), 204(2.858649e-43)
; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: ADD_INT T4.X, T0.Z, literal.x,
-; CM-NEXT: SETGT_UINT T4.Y, T0.Z, literal.y,
+; CM-NEXT: ADD_INT T6.X, T0.Z, literal.x,
+; CM-NEXT: SETGT_UINT T3.Y, T0.Z, literal.y,
; CM-NEXT: CNDE_INT T1.Z, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT * T2.W, T0.Z, literal.x,
+; CM-NEXT: SETGT_INT * T3.W, T0.Z, literal.x,
; CM-NEXT: -127(nan), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T5.X, PV.W, PV.Z, T0.Z,
-; CM-NEXT: CNDE_INT T3.Y, PV.Y, PV.X, T3.X,
-; CM-NEXT: SETGT_INT T0.Z, T0.Z, literal.x,
-; CM-NEXT: MUL_IEEE * T3.W, T1.Y, literal.y,
-; CM-NEXT: 127(1.779649e-43), 2130706432(1.701412e+38)
-; CM-NEXT: CNDE_INT T3.X, T4.Y, T1.Y, PV.W,
-; CM-NEXT: AND_INT T1.Y, KC0[3].Z, literal.x,
-; CM-NEXT: CNDE_INT T1.Z, PV.Z, PV.X, PV.Y,
-; CM-NEXT: CNDE_INT * T0.W, T1.W, T1.X, T0.W,
-; CM-NEXT: -4096(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T0.X, T2.W, PV.W, T0.X,
-; CM-NEXT: LSHL T3.Y, PV.Z, literal.x,
-; CM-NEXT: TRUNC T1.Z, T2.Y,
-; CM-NEXT: ADD * T0.W, KC0[3].Z, -PV.Y,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T1.X, PV.W, literal.x,
-; CM-NEXT: FLT_TO_INT T2.Y, PV.Z,
-; CM-NEXT: ADD_INT T1.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T1.W, T0.Z, PV.X, T3.X,
-; CM-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00)
-; CM-NEXT: MUL_IEEE T0.X, PV.W, PV.Z,
-; CM-NEXT: MIN_INT T3.Y, PV.Y, literal.x,
-; CM-NEXT: MULADD_IEEE T0.Z, T0.W, literal.y, PV.X,
-; CM-NEXT: ADD * T0.W, T0.Y, T2.X,
-; CM-NEXT: 381(5.338947e-43), 1079283712(3.321289e+00)
-; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE T0.Y, T0.W,
-; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
-; CM-NEXT: EXP_IEEE * T0.W (MASKED), T0.W,
-; CM-NEXT: MULADD_IEEE T1.X, T1.Y, literal.x, T0.Z,
-; CM-NEXT: MUL_IEEE T4.Y, PV.Y, literal.y,
-; CM-NEXT: ADD_INT T0.Z, T3.Y, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: MAX_INT * T0.W, T2.Y, literal.w, BS:VEC_201
-; CM-NEXT: 975668412(6.390323e-04), 2130706432(1.701412e+38)
-; CM-NEXT: -254(nan), -330(nan)
-; CM-NEXT: ADD_INT T2.X, T2.Y, literal.x,
-; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T1.Z, T2.Y, literal.z,
-; CM-NEXT: SETGT_UINT * T0.W, T2.Y, literal.w,
-; CM-NEXT: -127(nan), 204(2.858649e-43)
-; CM-NEXT: 102(1.429324e-43), -229(nan)
-; CM-NEXT: SETGT_UINT T3.X, T2.Y, literal.x,
-; CM-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T1.Z, T2.Y, literal.y,
-; CM-NEXT: MUL_IEEE * T1.W, T0.Y, literal.z, BS:VEC_120/SCL_212
-; CM-NEXT: 254(3.559298e-43), -127(nan)
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T4.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT * T3.Y, PV.Z, PV.Y, T2.Y,
-; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 104:
-; CM-NEXT: CNDE_INT T0.Z, T3.X, T2.X, T0.Z,
-; CM-NEXT: SETGT_INT * T2.W, T2.Y, literal.x,
+; CM-NEXT: CNDE_INT T7.X, PV.W, PV.Z, T0.Z,
+; CM-NEXT: CNDE_INT T2.Y, PV.Y, PV.X, T5.X,
+; CM-NEXT: SETGT_INT * T0.Z, T0.Z, literal.x,
; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T2.X, T1.Y, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, T3.Y, PV.Z,
-; CM-NEXT: CNDE_INT T0.Z, T0.W, T4.X, T1.W,
-; CM-NEXT: MUL_IEEE * T0.W, T4.Y, literal.y, BS:VEC_201
-; CM-NEXT: 1079283712(3.321289e+00), 2130706432(1.701412e+38)
-; CM-NEXT: AND_INT T4.X, KC0[4].X, literal.x,
-; CM-NEXT: CNDE_INT T2.Y, T3.X, T4.Y, PV.W,
-; CM-NEXT: CNDE_INT T0.Z, T1.Z, PV.Z, T0.Y,
-; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
-; CM-NEXT: -4096(nan), 23(3.222986e-44)
-; CM-NEXT: ADD_INT T3.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T2.W, PV.Z, PV.Y,
-; CM-NEXT: MUL_IEEE T0.Z, PV.X, literal.y,
-; CM-NEXT: RNDNE * T0.W, T2.X,
-; CM-NEXT: 1065353216(1.000000e+00), 1079283712(3.321289e+00)
-; CM-NEXT: ADD T2.X, T2.X, -PV.W,
-; CM-NEXT: RNDNE T1.Y, PV.Z,
-; CM-NEXT: MUL_IEEE T1.Z, PV.Y, PV.X,
-; CM-NEXT: SETGT * T1.W, literal.x, KC0[3].W,
-; CM-NEXT: -1036817932(-4.485347e+01), 0(0.000000e+00)
-; CM-NEXT: CNDE T3.X, PV.W, PV.Z, 0.0,
-; CM-NEXT: TRUNC T0.Y, T0.W,
-; CM-NEXT: TRUNC T1.Z, PV.Y,
-; CM-NEXT: ADD * T0.W, PV.X, T1.X,
+; CM-NEXT: ALU clause starting at 104:
+; CM-NEXT: ADD * T4.W, KC0[3].Z, -T3.X,
+; CM-NEXT: MUL_IEEE T5.X, PV.W, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, T0.Z, T7.X, T2.Y,
+; CM-NEXT: MUL_IEEE T1.Z, T1.Y, literal.y,
+; CM-NEXT: CNDE_INT * T1.W, T2.W, T2.X, T1.W, BS:VEC_021/SCL_122
+; CM-NEXT: 975668412(6.390323e-04), 2130706432(1.701412e+38)
+; CM-NEXT: CNDE_INT T0.X, T3.W, PV.W, T0.X,
+; CM-NEXT: CNDE_INT T1.Y, T3.Y, T1.Y, PV.Z,
+; CM-NEXT: LSHL T1.Z, PV.Y, literal.x,
+; CM-NEXT: MULADD_IEEE * T1.W, T4.W, literal.y, PV.X, BS:VEC_120/SCL_212
+; CM-NEXT: 23(3.222986e-44), 1079283712(3.321289e+00)
+; CM-NEXT: MULADD_IEEE T2.X, T3.X, literal.x, PV.W,
+; CM-NEXT: ADD T2.Y, T0.W, -T4.X,
+; CM-NEXT: ADD_INT T1.Z, PV.Z, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T0.Z, PV.X, PV.Y,
+; CM-NEXT: 975668412(6.390323e-04), 1065353216(1.000000e+00)
+; CM-NEXT: AND_INT T0.X, KC0[4].X, literal.x,
+; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z,
+; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].W,
+; CM-NEXT: ADD * T0.W, PV.Y, PV.X,
+; CM-NEXT: -4096(nan), -1036817932(-4.485347e+01)
; CM-NEXT: EXP_IEEE T0.X (MASKED), T0.W,
; CM-NEXT: EXP_IEEE T0.Y (MASKED), T0.W,
; CM-NEXT: EXP_IEEE T0.Z (MASKED), T0.W,
; CM-NEXT: EXP_IEEE * T0.W, T0.W,
-; CM-NEXT: FLT_TO_INT T1.X, T1.Z,
-; CM-NEXT: FLT_TO_INT T0.Y, T0.Y,
-; CM-NEXT: MUL_IEEE T1.Z, PV.W, literal.x,
-; CM-NEXT: ADD * T1.W, KC0[4].X, -T4.X,
-; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: MUL_IEEE T2.X, PV.W, literal.x,
-; CM-NEXT: MUL_IEEE T2.Y, T0.W, literal.y,
-; CM-NEXT: MUL_IEEE T2.Z, PV.Z, literal.z,
-; CM-NEXT: SETGT_UINT * T2.W, PV.Y, literal.w,
-; CM-NEXT: 975668412(6.390323e-04), 209715200(1.972152e-31)
-; CM-NEXT: 2130706432(1.701412e+38), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T5.X, PV.W, T1.Z, PV.Z,
-; CM-NEXT: MUL_IEEE T3.Y, PV.Y, literal.x,
-; CM-NEXT: MULADD_IEEE T1.Z, T1.W, literal.y, PV.X,
-; CM-NEXT: MAX_INT * T1.W, T1.X, literal.z,
-; CM-NEXT: 209715200(1.972152e-31), 1079283712(3.321289e+00)
-; CM-NEXT: -330(nan), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T2.X, PV.W, literal.x,
-; CM-NEXT: ADD_INT T4.Y, T1.X, literal.y,
-; CM-NEXT: MULADD_IEEE T1.Z, T4.X, literal.z, PV.Z, BS:VEC_120/SCL_212
-; CM-NEXT: MAX_INT * T1.W, T0.Y, literal.w,
-; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
+; CM-NEXT: CNDE T2.X, T0.Z, T1.Y, 0.0,
+; CM-NEXT: ADD T1.Y, KC0[4].X, -T0.X,
+; CM-NEXT: FLT_TO_INT T0.Z, T0.Y,
+; CM-NEXT: MUL_IEEE * T1.W, PV.W, literal.x,
+; CM-NEXT: 209715200(1.972152e-31), 0(0.000000e+00)
+; CM-NEXT: MUL_IEEE T3.X, PV.W, literal.x,
+; CM-NEXT: SETGT_UINT T0.Y, PV.Z, literal.y,
+; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.z,
+; CM-NEXT: MUL_IEEE * T2.W, T0.X, literal.w,
+; CM-NEXT: 209715200(1.972152e-31), -229(nan)
+; CM-NEXT: 975668412(6.390323e-04), 1079283712(3.321289e+00)
+; CM-NEXT: RNDNE T4.X, PV.W,
+; CM-NEXT: MULADD_IEEE T1.Y, T1.Y, literal.x, PV.Z,
+; CM-NEXT: CNDE_INT T1.Z, PV.Y, PV.X, T1.W,
+; CM-NEXT: SETGT_INT * T1.W, T0.Z, literal.y,
+; CM-NEXT: 1079283712(3.321289e+00), -127(nan)
+; CM-NEXT: CNDE_INT T3.X, PV.W, PV.Z, T0.W,
+; CM-NEXT: MULADD_IEEE T1.Y, T0.X, literal.x, PV.Y,
+; CM-NEXT: ADD T1.Z, T2.W, -PV.X,
+; CM-NEXT: MAX_INT * T2.W, T0.Z, literal.y,
; CM-NEXT: 975668412(6.390323e-04), -330(nan)
-; CM-NEXT: ADD T4.X, T0.Z, -T1.Y,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.x,
-; CM-NEXT: ADD_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: SETGT_UINT * T1.W, T0.Y, literal.z,
+; CM-NEXT: ADD_INT T0.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, T0.Z, literal.y,
+; CM-NEXT: TRUNC T2.Z, T4.X,
+; CM-NEXT: ADD * T2.W, PV.Z, PV.Y,
; CM-NEXT: 204(2.858649e-43), 102(1.429324e-43)
-; CM-NEXT: -229(nan), 0(0.000000e+00)
-; CM-NEXT: SETGT_UINT T6.X, T1.X, literal.x,
-; CM-NEXT: CNDE_INT T1.Y, PV.W, PV.Y, PV.Z,
-; CM-NEXT: SETGT_INT T0.Z, T0.Y, literal.y,
-; CM-NEXT: ADD * T3.W, PV.X, T1.Z,
-; CM-NEXT: -229(nan), -127(nan)
-; CM-NEXT: EXP_IEEE T1.X (MASKED), T3.W,
-; CM-NEXT: EXP_IEEE T1.Y (MASKED), T3.W,
-; CM-NEXT: EXP_IEEE T1.Z, T3.W,
-; CM-NEXT: EXP_IEEE * T1.W (MASKED), T3.W,
-; CM-NEXT: CNDE_INT T4.X, T0.Z, T1.Y, T0.Y,
-; CM-NEXT: CNDE_INT T1.Y, T6.X, T2.X, T4.Y, BS:VEC_120/SCL_212
-; CM-NEXT: SETGT_INT T2.Z, T1.X, literal.x,
-; CM-NEXT: MUL_IEEE * T3.W, PV.Z, literal.y,
-; CM-NEXT: -127(nan), 209715200(1.972152e-31)
-; CM-NEXT: MUL_IEEE T2.X, T1.Z, literal.x,
-; CM-NEXT: MUL_IEEE T4.Y, PV.W, literal.y,
-; CM-NEXT: CNDE_INT T3.Z, PV.Z, PV.Y, T1.X,
-; CM-NEXT: MIN_INT * T4.W, T1.X, literal.z,
+; CM-NEXT: EXP_IEEE T1.X (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE T1.Y, T2.W,
+; CM-NEXT: EXP_IEEE T1.Z (MASKED), T2.W,
+; CM-NEXT: EXP_IEEE * T1.W (MASKED), T2.W,
+; CM-NEXT: MUL_IEEE T4.X, T0.W, literal.x,
+; CM-NEXT: FLT_TO_INT T3.Y, T2.Z,
+; CM-NEXT: MUL_IEEE T1.Z, PV.Y, literal.y,
+; CM-NEXT: CNDE_INT * T0.W, T0.Y, T0.X, T2.Y,
; CM-NEXT: 2130706432(1.701412e+38), 209715200(1.972152e-31)
+; CM-NEXT: CNDE_INT T0.X, T1.W, PV.W, T0.Z,
+; CM-NEXT: MUL_IEEE T0.Y, PV.Z, literal.x,
+; CM-NEXT: MAX_INT T2.Z, PV.Y, literal.y,
+; CM-NEXT: MIN_INT * T0.W, PV.Y, literal.z,
+; CM-NEXT: 209715200(1.972152e-31), -330(nan)
; CM-NEXT: 381(5.338947e-43), 0(0.000000e+00)
-; CM-NEXT: MIN_INT T7.X, T0.Y, literal.x,
-; CM-NEXT: ADD_INT T1.Y, PV.W, literal.y,
-; CM-NEXT: ADD_INT T4.Z, T1.X, literal.z,
-; CM-NEXT: SETGT_UINT * T4.W, T1.X, literal.w,
-; CM-NEXT: 381(5.338947e-43), -254(nan)
+; CM-NEXT: ADD_INT T5.X, PV.W, literal.x,
+; CM-NEXT: ADD_INT T2.Y, PV.Z, literal.y,
+; CM-NEXT: ADD_INT T2.Z, T3.Y, literal.z,
+; CM-NEXT: SETGT_UINT * T0.W, T3.Y, literal.w,
+; CM-NEXT: -254(nan), 204(2.858649e-43)
+; CM-NEXT: 102(1.429324e-43), -229(nan)
+; CM-NEXT: ADD_INT T6.X, T3.Y, literal.x,
+; CM-NEXT: SETGT_UINT T4.Y, T3.Y, literal.y,
+; CM-NEXT: CNDE_INT T2.Z, PV.W, PV.Y, PV.Z,
+; CM-NEXT: SETGT_INT * T1.W, T3.Y, literal.x,
; CM-NEXT: -127(nan), 254(3.559298e-43)
-; CM-NEXT: CNDE_INT T8.X, PV.W, PV.Z, PV.Y,
-; CM-NEXT: SETGT_INT T1.Y, T1.X, literal.x,
-; CM-NEXT: ADD_INT T4.Z, PV.X, literal.y,
-; CM-NEXT: ADD_INT * T5.W, T0.Y, literal.z,
+; CM-NEXT: MUL_IEEE T7.X, T1.Y, literal.x,
+; CM-NEXT: CNDE_INT T2.Y, PV.W, PV.Z, T3.Y,
+; CM-NEXT: CNDE_INT T2.Z, PV.Y, PV.X, T5.X,
+; CM-NEXT: MIN_INT * T2.W, T0.Z, literal.y,
+; CM-NEXT: 2130706432(1.701412e+38), 381(5.338947e-43)
+; CM-NEXT: SETGT_INT T5.X, T3.Y, literal.x,
+; CM-NEXT: ADD_INT T3.Y, PV.W, literal.y,
+; CM-NEXT: ADD_INT T3.Z, T0.Z, literal.z,
+; CM-NEXT: SETGT_UINT * T2.W, T0.Z, literal.w,
; CM-NEXT: 127(1.779649e-43), -254(nan)
-; CM-NEXT: -127(nan), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT T1.X, T2.W, PV.W, PV.Z,
-; CM-NEXT: CNDE_INT T5.Y, PV.Y, T3.Z, PV.X,
-; CM-NEXT: CNDE_INT T3.Z, T6.X, T4.Y, T3.W,
-; CM-NEXT: MUL_IEEE * T2.W, T2.X, literal.x, BS:VEC_120/SCL_212
+; CM-NEXT: -127(nan), 254(3.559298e-43)
+; CM-NEXT: CNDE_INT T6.X, PV.W, PV.Z, PV.Y,
+; CM-NEXT: CNDE_INT T2.Y, PV.X, T2.Y, T2.Z,
+; CM-NEXT: MUL_IEEE T2.Z, T7.X, literal.x,
+; CM-NEXT: CNDE_INT * T0.W, T0.W, T0.Y, T1.Z, BS:VEC_021/SCL_122
; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
-; CM-NEXT: SETGT_INT T6.X, T0.Y, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T4.W, T2.X, PV.W,
-; CM-NEXT: CNDE_INT * T1.Z, T2.Z, PV.Z, T1.Z,
-; CM-NEXT: 127(1.779649e-43), 0(0.000000e+00)
-; CM-NEXT: ALU clause starting at 205:
-; CM-NEXT: LSHL * T2.W, T5.Y, literal.x,
-; CM-NEXT: 23(3.222986e-44), 0(0.000000e+00)
-; CM-NEXT: ADD_INT T2.X, PV.W, literal.x,
-; CM-NEXT: CNDE_INT T0.Y, T1.Y, T1.Z, T0.Y,
-; CM-NEXT: CNDE_INT * T1.Z, T6.X, T4.X, T1.X,
+; CM-NEXT: SETGT_INT T8.X, T0.Z, literal.x,
+; CM-NEXT: CNDE_INT T0.Y, T1.W, PV.W, T1.Y,
+; CM-NEXT: CNDE_INT T0.Z, T4.Y, T7.X, PV.Z,
+; CM-NEXT: LSHL * T0.W, PV.Y, literal.y,
+; CM-NEXT: 127(1.779649e-43), 23(3.222986e-44)
+; CM-NEXT: ALU clause starting at 202:
+; CM-NEXT: ADD_INT T7.X, T0.W, literal.x,
+; CM-NEXT: CNDE_INT * T0.Y, T5.X, T0.Y, T0.Z,
; CM-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00)
-; CM-NEXT: CNDE_INT * T1.W, T1.W, T3.Y, T2.Y,
-; CM-NEXT: CNDE_INT T1.X, T0.Z, PV.W, T0.W,
-; CM-NEXT: LSHL T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
-; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T2.X,
+; CM-NEXT: CNDE_INT * T0.Z, T8.X, T0.X, T6.X,
+; CM-NEXT: MUL_IEEE * T0.W, T4.X, literal.x,
+; CM-NEXT: 2130706432(1.701412e+38), 0(0.000000e+00)
+; CM-NEXT: CNDE_INT T0.X, T2.W, T4.X, PV.W,
+; CM-NEXT: LSHL T1.Y, T0.Z, literal.x,
+; CM-NEXT: MUL_IEEE T0.Z, T0.Y, T7.X, BS:VEC_021/SCL_122
; CM-NEXT: SETGT * T0.W, literal.y, KC0[4].X,
; CM-NEXT: 23(3.222986e-44), -1036817932(-4.485347e+01)
-; CM-NEXT: CNDE T2.X, PV.W, PV.Z, 0.0,
+; CM-NEXT: CNDE T4.X, PV.W, PV.Z, 0.0,
; CM-NEXT: SETGT T0.Y, KC0[4].X, literal.x,
; CM-NEXT: ADD_INT T0.Z, PV.Y, literal.y,
-; CM-NEXT: CNDE_INT * T0.W, T6.X, PV.X, T5.X,
+; CM-NEXT: CNDE_INT * T0.W, T8.X, T3.X, PV.X,
; CM-NEXT: 1109008539(3.853184e+01), 1065353216(1.000000e+00)
-; CM-NEXT: SETGT T1.X, KC0[3].W, literal.x,
+; CM-NEXT: SETGT T0.X, KC0[3].W, literal.x,
; CM-NEXT: MUL_IEEE T1.Y, PV.W, PV.Z,
; CM-NEXT: SETGT T0.Z, literal.y, KC0[3].Z,
; CM-NEXT: CNDE * T0.W, PV.Y, PV.X, literal.z,
; CM-NEXT: 1109008539(3.853184e+01), -1036817932(-4.485347e+01)
; CM-NEXT: 2139095040(INF), 0(0.000000e+00)
-; CM-NEXT: SETGT T2.X, literal.x, KC0[3].Y,
+; CM-NEXT: SETGT T3.X, literal.x, KC0[3].Y,
; CM-NEXT: CNDE T0.Y, PV.Z, PV.Y, 0.0,
-; CM-NEXT: CNDE T0.Z, PV.X, T3.X, literal.y,
+; CM-NEXT: CNDE T0.Z, PV.X, T2.X, literal.y,
; CM-NEXT: SETGT * T1.W, KC0[3].Z, literal.z,
; CM-NEXT: -1036817932(-4.485347e+01), 2139095040(INF)
; CM-NEXT: 1109008539(3.853184e+01), 0(0.000000e+00)
; CM-NEXT: CNDE T0.Y, PV.W, PV.Y, literal.x,
-; CM-NEXT: CNDE T1.Z, PV.X, T0.X, 0.0,
+; CM-NEXT: CNDE T1.Z, PV.X, T1.X, 0.0,
; CM-NEXT: SETGT * T1.W, KC0[3].Y, literal.y,
; CM-NEXT: 2139095040(INF), 1109008539(3.853184e+01)
; CM-NEXT: CNDE * T0.X, PV.W, PV.Z, literal.x,
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 1b0306559295..0492c5663e66 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -1,8 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; GCN-LABEL: {{^}}v_sad_u32_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_sad_u32 v2, s0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -16,9 +27,18 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_constant_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20
define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a) {
+; GCN-LABEL: v_sad_u32_constant_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, 0x5a
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_sad_u32 v2, s2, v0, 20
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, 90
%t0 = select i1 %icmp0, i32 %a, i32 90
@@ -32,9 +52,19 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_sad_u32 v2, s0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
%sub1 = sub i32 %b, %a
@@ -46,12 +76,28 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat1:
-; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_sub_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_max_u32 s3, s0, s1
+; GCN-NEXT: s_min_u32 s0, s0, s1
+; GCN-NEXT: s_sub_i32 s0, s3, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -66,9 +112,25 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_add_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_sad_u32 v2, s0, v2, v3
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -82,9 +144,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_max_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_max_u32 s3, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v3
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
store volatile i32 %t0, ptr addrspace(5) undef
@@ -99,9 +179,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_min_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_min_u32 s3, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v3
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -117,9 +215,27 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_sub_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_sub_i32 s3, s0, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: flat_store_dword v[0:1], v3
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
store volatile i32 %sub0, ptr addrspace(5) undef
@@ -132,11 +248,29 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
-; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_cmp_gt_u32 s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: v_sad_u32_multi_use_select_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-NEXT: s_mov_b64 s[8:9], s[0:1]
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_add_u32 s8, s8, s7
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_sub_i32 s3, s0, s1
+; GCN-NEXT: s_sub_i32 s6, s1, s0
+; GCN-NEXT: s_cmp_gt_u32 s0, s1
+; GCN-NEXT: s_cselect_b32 s0, s3, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: buffer_store_dword v2, v0, s[8:11], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
%sub1 = sub i32 %b, %a
@@ -149,12 +283,29 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_vector_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; GCN-LABEL: v_sad_u32_vector_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s15
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_sad_u32 v3, s11, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_sad_u32 v2, s10, v2, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_sad_u32 v1, s9, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_sad_u32 v0, s8, v0, v4
+; GCN-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt <4 x i32> %a, %b
%t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b
@@ -168,12 +319,29 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_vector_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; GCN-LABEL: v_sad_u32_vector_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s15
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s14
+; GCN-NEXT: v_sad_u32 v3, s11, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_sad_u32 v2, s10, v2, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_sad_u32 v1, s9, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_sad_u32 v0, s8, v0, v4
+; GCN-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt <4 x i32> %a, %b
%sub0 = sub <4 x i32> %a, %b
%sub1 = sub <4 x i32> %b, %a
@@ -185,10 +353,22 @@ define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_i16_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16 %b, i16 %c) {
-
+; GCN-LABEL: v_sad_u32_i16_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s4, s6, 0xffff
+; GCN-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_sad_u32 v2, s4, v1, v0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: flat_store_short v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i16 %a, %b
%t0 = select i1 %icmp0, i16 %a, i16 %b
@@ -202,9 +382,22 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
+; GCN-LABEL: v_sad_u32_i16_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: flat_load_ushort v1, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: flat_load_ushort v2, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_sad_u32 v2, v0, v1, v2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: flat_store_short v[0:1], v2
+; GCN-NEXT: s_endpgm
%a = load volatile i16, ptr addrspace(1) undef
%b = load volatile i16, ptr addrspace(1) undef
%c = load volatile i16, ptr addrspace(1) undef
@@ -219,9 +412,22 @@ define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_i8_pat1:
-; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b, i8 %c) {
+; GCN-LABEL: v_sad_u32_i8_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s3, s2, 0xff
+; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
+; GCN-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NEXT: v_sad_u32 v2, s3, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: flat_store_byte v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i8 %a, %b
%t0 = select i1 %icmp0, i8 %a, i8 %b
@@ -235,9 +441,22 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
-; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
+; GCN-LABEL: v_sad_u32_i8_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: flat_load_ubyte v1, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: flat_load_ubyte v2, v[0:1] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_sad_u32 v2, v0, v1, v2
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: flat_store_byte v[0:1], v2
+; GCN-NEXT: s_endpgm
%a = load volatile i8, ptr addrspace(1) undef
%b = load volatile i8, ptr addrspace(1) undef
%c = load volatile i8, ptr addrspace(1) undef
@@ -252,15 +471,26 @@ define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
ret void
}
-; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
-; GCN: s_load_dword
-; GCN-DAG: s_bfe_u32
-; GCN-DAG: s_sub_i32
-; GCN-DAG: s_and_b32
-; GCN-DAG: s_sub_i32
-; GCN-DAG: s_lshr_b32
-; GCN: s_add_i32
define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+; GCN-LABEL: s_sad_u32_i8_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s2, s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshr_b32 s4, s2, 8
+; GCN-NEXT: s_and_b32 s3, s2, 0xff
+; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008
+; GCN-NEXT: s_lshr_b32 s6, s2, 16
+; GCN-NEXT: s_sub_i32 s7, s2, s4
+; GCN-NEXT: s_sub_i32 s2, s4, s2
+; GCN-NEXT: s_cmp_gt_u32 s3, s5
+; GCN-NEXT: s_cselect_b32 s2, s7, s2
+; GCN-NEXT: s_add_i32 s2, s2, s6
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: flat_store_byte v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i8 %a, %b
%sub0 = sub i8 %a, %b
%sub1 = sub i8 %b, %a
@@ -272,12 +502,22 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat1:
-; GCN-DAG: s_cmp_le_u32 s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+; GCN-LABEL: v_sad_u32_mismatched_operands_pat1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_max_u32 s6, s0, s1
+; GCN-NEXT: s_cmp_le_u32 s0, s1
+; GCN-NEXT: s_cselect_b32 s0, s0, s3
+; GCN-NEXT: s_sub_i32 s0, s6, s0
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%t0 = select i1 %icmp0, i32 %a, i32 %b
@@ -291,11 +531,22 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
ret void
}
-; GCN-LABEL: {{^}}v_sad_u32_mismatched_operands_pat2:
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+; GCN-LABEL: v_sad_u32_mismatched_operands_pat2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_sub_i32 s3, s0, s3
+; GCN-NEXT: s_sub_i32 s6, s1, s0
+; GCN-NEXT: s_cmp_lt_u32 s1, s0
+; GCN-NEXT: s_cselect_b32 s0, s3, s6
+; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: flat_store_dword v[0:1], v2
+; GCN-NEXT: s_endpgm
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %d
%sub1 = sub i32 %b, %a
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index b1a82daa8e7d..b3f4790df4d4 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -795,17 +795,17 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: AND_INT T1.Y, T0.Z, literal.x,
-; EG-NEXT: LSHR T1.Z, T0.Y, 1,
+; EG-NEXT: LSHR T1.Y, T0.Y, 1,
+; EG-NEXT: NOT_INT T1.Z, T0.Z,
; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
-; EG-NEXT: NOT_INT * T1.W, T0.Z,
+; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T1.Z, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T0.W, T0.X, PV.Y,
+; EG-NEXT: LSHL T2.Z, T0.X, PS,
+; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z,
; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
+; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T2.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
@@ -858,8 +858,8 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 22, @11, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T0.X, 1
+; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
@@ -868,27 +868,28 @@ define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
-; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x,
+; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: LSHL T2.X, T0.Z, PV.W,
+; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR T2.Z, T0.W, 1,
-; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
+; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, BS:VEC_102/SCL_221
; EG-NEXT: NOT_INT * T1.W, T1.Z,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T3.X, PV.Z, PV.W, PS,
+; EG-NEXT: LSHR T2.Y, T0.Y, 1,
+; EG-NEXT: NOT_INT T0.Z, T1.X,
+; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
+; EG-NEXT: AND_INT * T1.W, T1.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL * T1.W, T0.Z, PV.Y,
-; EG-NEXT: AND_INT T2.X, T1.Z, literal.x,
-; EG-NEXT: AND_INT T1.Y, T1.X, literal.y,
-; EG-NEXT: LSHR T0.Z, T0.Y, 1,
-; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
-; EG-NEXT: NOT_INT * T3.W, T1.X,
-; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
-; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T0.Z, T0.X, PV.Y,
-; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT * T3.W, PV.X, T0.W, T1.W,
+; EG-NEXT: LSHL T0.Y, T0.X, PS, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T1.Z, T1.X, literal.x, BS:VEC_201
+; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z,
+; EG-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, T2.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T3.Y, PV.W, PV.Y, PV.Z,
-; EG-NEXT: CNDE_INT * T3.Z, T2.X, T1.W, 0.0,
-; EG-NEXT: CNDE_INT T3.X, T2.W, T0.Z, 0.0,
+; EG-NEXT: CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT: CNDE_INT * T2.Z, T1.Y, T2.X, 0.0,
+; EG-NEXT: CNDE_INT T2.X, T1.Z, T0.Y, 0.0,
; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1
@@ -955,65 +956,66 @@ define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 47, @15, KC0[CB0:0-32], KC1[]
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
-; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T0.X, 1
+; EG-NEXT: ALU 48, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
-; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 48, #1
-; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
-; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 32, #1
-; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1
+; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 48, #1
+; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1
+; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: AND_INT T4.Z, T1.Z, literal.x,
-; EG-NEXT: LSHR T1.W, T0.W, 1,
-; EG-NEXT: NOT_INT * T3.W, T1.Z,
+; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T4.X, T0.W, T0.Z, 1,
-; EG-NEXT: AND_INT T1.Y, T3.Z, literal.x, BS:VEC_201
-; EG-NEXT: LSHR T5.Z, T2.W, 1, BS:VEC_120/SCL_212
-; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, T2.Z, 1, BS:VEC_102/SCL_221
-; EG-NEXT: NOT_INT * T2.W, T3.Z,
-; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T3.Y, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T2.Z, T2.Z, PV.Y,
-; EG-NEXT: BIT_ALIGN_INT T0.W, T1.W, PV.X, T3.W,
-; EG-NEXT: LSHL * T1.W, T0.Z, T4.Z,
+; EG-NEXT: LSHL * T1.W, T0.Z, PV.W,
; EG-NEXT: AND_INT T4.X, T1.Z, literal.x,
-; EG-NEXT: AND_INT T1.Y, T1.X, literal.y,
-; EG-NEXT: LSHR T0.Z, T0.Y, 1,
-; EG-NEXT: BIT_ALIGN_INT T2.W, T0.Y, T0.X, 1,
-; EG-NEXT: NOT_INT * T3.W, T1.X,
+; EG-NEXT: LSHR T1.Y, T3.W, 1,
+; EG-NEXT: NOT_INT T4.Z, T2.Z, BS:VEC_201
+; EG-NEXT: BIT_ALIGN_INT T2.W, T3.W, T3.Z, 1,
+; EG-NEXT: AND_INT * T3.W, T2.Z, literal.y,
; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
-; EG-NEXT: AND_INT T5.X, T3.Z, literal.x,
-; EG-NEXT: BIT_ALIGN_INT T0.Y, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T0.Z, T0.X, PV.Y,
-; EG-NEXT: AND_INT T2.W, T1.X, literal.x, BS:VEC_120/SCL_212
-; EG-NEXT: CNDE_INT * T4.W, PV.X, T0.W, T1.W,
+; EG-NEXT: LSHL T5.X, T3.Z, PS,
+; EG-NEXT: AND_INT T2.Y, T2.Z, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: BIT_ALIGN_INT T2.Z, PV.Y, PV.W, PV.Z,
+; EG-NEXT: LSHR T2.W, T3.Y, 1,
+; EG-NEXT: NOT_INT * T3.W, T2.X,
+; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T6.X, T3.Y, T3.X, 1,
+; EG-NEXT: AND_INT T1.Y, T2.X, literal.x,
+; EG-NEXT: LSHR T3.Z, T0.W, 1,
+; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
+; EG-NEXT: NOT_INT * T4.W, T1.Z,
+; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
+; EG-NEXT: BIT_ALIGN_INT T7.X, PV.Z, PV.W, PS,
+; EG-NEXT: LSHL T1.Y, T3.X, PV.Y, BS:VEC_120/SCL_212
+; EG-NEXT: AND_INT T0.Z, T2.X, literal.x, BS:VEC_201
+; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, PV.X, T3.W,
+; EG-NEXT: CNDE_INT * T3.W, T2.Y, T2.Z, T5.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: AND_INT T0.X, T3.X, literal.x,
-; EG-NEXT: CNDE_INT T4.Y, PV.W, PV.Y, PV.Z,
-; EG-NEXT: LSHR T1.Z, T2.Y, 1,
-; EG-NEXT: BIT_ALIGN_INT T0.W, T2.Y, T2.X, 1,
-; EG-NEXT: NOT_INT * T3.W, T3.X,
+; EG-NEXT: LSHR T2.X, T0.Y, 1,
+; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, PV.Y,
+; EG-NEXT: NOT_INT T1.Z, T1.X,
+; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
+; EG-NEXT: AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: BIT_ALIGN_INT T1.X, PV.Z, PV.W, PS,
-; EG-NEXT: LSHL T0.Y, T2.X, PV.X,
-; EG-NEXT: CNDE_INT T4.Z, T4.X, T1.W, 0.0, BS:VEC_120/SCL_212
-; EG-NEXT: AND_INT * T0.W, T3.X, literal.x, BS:VEC_201
+; EG-NEXT: LSHL T0.X, T0.X, PS,
+; EG-NEXT: AND_INT T0.Y, T1.X, literal.x, BS:VEC_120/SCL_212
+; EG-NEXT: CNDE_INT T3.Z, T2.Y, T5.X, 0.0, BS:VEC_021/SCL_122
+; EG-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.W, PV.Z,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T1.W, T5.X, T3.Y, T2.Z,
-; EG-NEXT: CNDE_INT T4.X, T2.W, T0.Z, 0.0,
-; EG-NEXT: CNDE_INT T1.Y, T0.W, T1.X, T0.Y, BS:VEC_120/SCL_212
-; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
+; EG-NEXT: CNDE_INT * T2.W, T4.X, T7.X, T1.W,
+; EG-NEXT: CNDE_INT T3.X, T0.Z, T1.Y, 0.0,
+; EG-NEXT: CNDE_INT T2.Y, T0.Y, T0.W, T0.X,
+; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: LSHR T0.X, PV.W, literal.x,
-; EG-NEXT: CNDE_INT T1.Z, T5.X, T2.Z, 0.0,
-; EG-NEXT: CNDE_INT * T1.X, T0.W, T0.Y, 0.0,
+; EG-NEXT: LSHR T1.X, PV.W, literal.x,
+; EG-NEXT: CNDE_INT T2.Z, T4.X, T1.W, 0.0,
+; EG-NEXT: CNDE_INT * T2.X, T0.Y, T0.X, 0.0,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
+; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
%a = load <4 x i64>, ptr addrspace(1) %in
@@ -1172,17 +1174,17 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) {
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: MOV T0.W, literal.y,
-; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
-; EG-NEXT: 31(4.344025e-44), -1(nan)
-; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
-; EG-NEXT: LSHL T0.W, literal.y, PV.Z,
+; EG-NEXT: MOV T0.Z, literal.x,
+; EG-NEXT: NOT_INT T0.W, KC0[2].W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
+; EG-NEXT: -1(nan), 31(4.344025e-44)
+; EG-NEXT: LSHL T1.Z, literal.x, PS,
+; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
-; EG-NEXT: 32767(4.591635e-41), -1(nan)
+; EG-NEXT: -1(nan), 32767(4.591635e-41)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
+; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 281474976710655, %a
@@ -1423,15 +1425,15 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: NOT_INT T0.W, KC0[2].W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
+; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.Z, literal.x, PS,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
-; EG-NEXT: 64(8.968310e-44), 32(4.484155e-44)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
+; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.x,
+; EG-NEXT: LSHL * T0.W, literal.y, PV.W,
+; EG-NEXT: 32(4.484155e-44), 64(8.968310e-44)
+; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 64, %a
@@ -1903,16 +1905,16 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: NOT_INT T0.W, KC0[2].W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
+; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
+; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
-; EG-NEXT: LSHL T0.Z, literal.x, PS,
-; EG-NEXT: BIT_ALIGN_INT T0.W, 0.0, literal.y, PV.W,
-; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
-; EG-NEXT: 1082130432(4.000000e+00), 541065216(1.626303e-19)
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.Z, 0.0,
+; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
+; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.y,
+; EG-NEXT: LSHL * T0.W, literal.z, PV.W,
+; EG-NEXT: 541065216(1.626303e-19), 32(4.484155e-44)
+; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00)
+; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 1082130432, %a
@@ -1959,17 +1961,17 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %ou
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
-; EG-NEXT: AND_INT T0.Z, KC0[2].W, literal.x,
-; EG-NEXT: MOV T0.W, literal.y,
-; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
-; EG-NEXT: 31(4.344025e-44), -532676608(-5.534023e+19)
-; EG-NEXT: BIT_ALIGN_INT T1.Z, literal.x, PV.W, PS,
-; EG-NEXT: LSHL T0.W, literal.y, PV.Z,
+; EG-NEXT: MOV T0.Z, literal.x,
+; EG-NEXT: NOT_INT T0.W, KC0[2].W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y,
+; EG-NEXT: -532676608(-5.534023e+19), 31(4.344025e-44)
+; EG-NEXT: LSHL T1.Z, literal.x, PS,
+; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z,
-; EG-NEXT: 2147483647(nan), -1065353216(-4.000000e+00)
+; EG-NEXT: -1065353216(-4.000000e+00), 2147483647(nan)
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
+; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z,
+; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%shl = shl i64 -1065353216, %a
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 152eba5dec94..5a241f85b2e2 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -506,8 +506,8 @@ entry:
}
; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
-; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2)
-; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+; GCN: [[SQRT:%.+]] = tail call fast float @llvm.sqrt.f32(float %tmp)
+; GCN-NEXT: fdiv fast float 1.000000e+00, [[SQRT]]
define amdgpu_kernel void @test_rootn_m2(ptr addrspace(1) nocapture %a) {
entry:
%tmp = load float, ptr addrspace(1) %a, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index dcc5fbd142c4..7dce633e9186 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -264,6 +264,142 @@ ret:
ret void
}
+define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
+; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after:
+; NOHSA-TRAP-GFX900: ; %bb.0:
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
+; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
+; NOHSA-TRAP-GFX900-NEXT: ; %bb.1:
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; NOHSA-TRAP-GFX900-NEXT: .LBB2_2:
+; NOHSA-TRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX803-LABEL: trap_with_use_after:
+; HSA-TRAP-GFX803: ; %bb.0:
+; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
+; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5
+; HSA-TRAP-GFX803-NEXT: flat_load_dword v2, v[0:1] glc
+; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s6
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s7
+; HSA-TRAP-GFX803-NEXT: s_trap 2
+; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
+; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX803-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX900-LABEL: trap_with_use_after:
+; HSA-TRAP-GFX900: ; %bb.0:
+; HSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; HSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-NEXT: s_trap 2
+; HSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; HSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-NOTRAP-GFX900-LABEL: trap_with_use_after:
+; HSA-NOTRAP-GFX900: ; %bb.0:
+; HSA-NOTRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; HSA-NOTRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
+; HSA-NOTRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-NOTRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
+; HSA-NOTRAP-GFX900-NEXT: ; %bb.1:
+; HSA-NOTRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; HSA-NOTRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
+; HSA-NOTRAP-GFX900-NEXT: .LBB2_2:
+; HSA-NOTRAP-GFX900-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX1100-LABEL: trap_with_use_after:
+; HSA-TRAP-GFX1100: ; %bb.0:
+; HSA-TRAP-GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; HSA-TRAP-GFX1100-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: s_cbranch_execnz .LBB2_2
+; HSA-TRAP-GFX1100-NEXT: ; %bb.1:
+; HSA-TRAP-GFX1100-NEXT: global_store_b32 v0, v1, s[2:3] dlc
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-NEXT: s_nop 0
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; HSA-TRAP-GFX1100-NEXT: s_endpgm
+; HSA-TRAP-GFX1100-NEXT: .LBB2_2:
+; HSA-TRAP-GFX1100-NEXT: s_trap 2
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-NEXT: s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; HSA-TRAP-GFX1100-NEXT: s_bitset1_b32 s0, 10
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-NEXT: s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-NEXT: s_sethalt 5
+; HSA-TRAP-GFX1100-NEXT: s_branch .LBB2_3
+;
+; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after:
+; HSA-TRAP-GFX1100-O0: ; %bb.0:
+; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:8 ; 4-byte Folded Spill
+; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s2, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v1, s3, 1
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v1, off offset:4 ; 4-byte Folded Spill
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6
+; HSA-TRAP-GFX1100-O0-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off ; 4-byte Folded Spill
+; HSA-TRAP-GFX1100-O0-NEXT: s_cbranch_execnz .LBB2_2
+; HSA-TRAP-GFX1100-O0-NEXT: ; %bb.1:
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_saveexec_b32 s6, -1
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 exec_lo, s6
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s0, v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_readlane_b32 s1, v0, 1
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v1, off, off offset:8 ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT: scratch_load_b32 v2, off, off ; 4-byte Folded Reload
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt vmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v1, v2, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: ; kill: killed $vgpr0
+; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm
+; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_2:
+; HSA-TRAP-GFX1100-O0-NEXT: s_trap 2
+; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 ttmp2, m0
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: s_and_b32 s0, s0, 0x3ff
+; HSA-TRAP-GFX1100-O0-NEXT: s_or_b32 s0, s0, 0x400
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, s0
+; HSA-TRAP-GFX1100-O0-NEXT: s_sendmsg sendmsg(MSG_INTERRUPT)
+; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b32 m0, ttmp2
+; HSA-TRAP-GFX1100-O0-NEXT: .LBB2_3: ; =>This Inner Loop Header: Depth=1
+; HSA-TRAP-GFX1100-O0-NEXT: s_sethalt 5
+; HSA-TRAP-GFX1100-O0-NEXT: s_branch .LBB2_3
+ %tmp = load volatile i32, ptr addrspace(1) %arg0
+ call void @llvm.trap()
+ store volatile i32 %tmp, ptr addrspace(1) %arg1
+ ret void
+}
+
define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: debugtrap:
; NOHSA-TRAP-GFX900: ; %bb.0:
@@ -334,6 +470,20 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
; HSA-TRAP-GFX1100-NEXT: s_nop 0
; HSA-TRAP-GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; HSA-TRAP-GFX1100-NEXT: s_endpgm
+;
+; HSA-TRAP-GFX1100-O0-LABEL: debugtrap:
+; HSA-TRAP-GFX1100-O0: ; %bb.0:
+; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
+; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: s_trap 3
+; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 2
+; HSA-TRAP-GFX1100-O0-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt_vscnt null, 0x0
+; HSA-TRAP-GFX1100-O0-NEXT: s_endpgm
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.debugtrap()
store volatile i32 2, ptr addrspace(1) %arg0
diff --git a/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll b/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
index e14e89916e6d..2a0ad07474c0 100644
--- a/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
+++ b/llvm/test/CodeGen/Mips/mipsr6-minmaxnum.ll
@@ -6,13 +6,17 @@
define float @mins(float %x, float %y) {
; MIPS32R6EL-LABEL: mins
; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: min.s $f0, $f14, $f14
+; MIPS32R6EL-NEXT: min.s $f1, $f12, $f12
; MIPS32R6EL-NEXT: jr $ra
-; MIPS32R6EL-NEXT: min.s $f0, $f12, $f14
+; MIPS32R6EL-NEXT: min.s $f0, $f1, $f0
;
; MIPS64R6EL-LABEL: mins
; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: min.s $f0, $f13, $f13
+; MIPS64R6EL-NEXT: min.s $f1, $f12, $f12
; MIPS64R6EL-NEXT: jr $ra
-; MIPS64R6EL-NEXT: min.s $f0, $f12, $f13
+; MIPS64R6EL-NEXT: min.s $f0, $f1, $f0
%r = tail call float @llvm.minnum.f32(float %x, float %y)
ret float %r
@@ -21,13 +25,17 @@ define float @mins(float %x, float %y) {
define float @maxs(float %x, float %y) {
; MIPS32R6EL-LABEL: maxs
; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: min.s $f0, $f14, $f14
+; MIPS32R6EL-NEXT: min.s $f1, $f12, $f12
; MIPS32R6EL-NEXT: jr $ra
-; MIPS32R6EL-NEXT: max.s $f0, $f12, $f14
+; MIPS32R6EL-NEXT: max.s $f0, $f1, $f0
;
; MIPS64R6EL-LABEL: maxs
; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: min.s $f0, $f13, $f13
+; MIPS64R6EL-NEXT: min.s $f1, $f12, $f12
; MIPS64R6EL-NEXT: jr $ra
-; MIPS64R6EL-NEXT: max.s $f0, $f12, $f13
+; MIPS64R6EL-NEXT: max.s $f0, $f1, $f0
%r = tail call float @llvm.maxnum.f32(float %x, float %y)
ret float %r
@@ -36,13 +44,17 @@ define float @maxs(float %x, float %y) {
define double @mind(double %x, double %y) {
; MIPS32R6EL-LABEL: mind
; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: min.d $f0, $f14, $f14
+; MIPS32R6EL-NEXT: min.d $f1, $f12, $f12
; MIPS32R6EL-NEXT: jr $ra
-; MIPS32R6EL-NEXT: min.d $f0, $f12, $f14
+; MIPS32R6EL-NEXT: min.d $f0, $f1, $f0
;
; MIPS64R6EL-LABEL: mind
; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: min.d $f0, $f13, $f13
+; MIPS64R6EL-NEXT: min.d $f1, $f12, $f12
; MIPS64R6EL-NEXT: jr $ra
-; MIPS64R6EL-NEXT: min.d $f0, $f12, $f13
+; MIPS64R6EL-NEXT: min.d $f0, $f1, $f0
%r = tail call double @llvm.minnum.f64(double %x, double %y)
ret double %r
@@ -51,13 +63,17 @@ define double @mind(double %x, double %y) {
define double @maxd(double %x, double %y) {
; MIPS32R6EL-LABEL: maxd
; MIPS32R6EL: # %bb.0:
+; MIPS32R6EL-NEXT: min.d $f0, $f14, $f14
+; MIPS32R6EL-NEXT: min.d $f1, $f12, $f12
; MIPS32R6EL-NEXT: jr $ra
-; MIPS32R6EL-NEXT: max.d $f0, $f12, $f14
+; MIPS32R6EL-NEXT: max.d $f0, $f1, $f0
;
; MIPS64R6EL-LABEL: maxd
; MIPS64R6EL: # %bb.0:
+; MIPS64R6EL-NEXT: min.d $f0, $f13, $f13
+; MIPS64R6EL-NEXT: min.d $f1, $f12, $f12
; MIPS64R6EL-NEXT: jr $ra
-; MIPS64R6EL-NEXT: max.d $f0, $f12, $f13
+; MIPS64R6EL-NEXT: max.d $f0, $f1, $f0
%r = tail call double @llvm.maxnum.f64(double %x, double %y)
ret double %r
diff --git a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index fe68bee408fc..42b0f69181ab 100644
--- a/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/llvm/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -2466,13 +2466,14 @@ define void @fminnum(float %b) {
; MIPSR6-O32-NEXT: lui $2, %hi(_gp_disp)
; MIPSR6-O32-NEXT: addiu $2, $2, %lo(_gp_disp)
; MIPSR6-O32-NEXT: addu $1, $2, $25
+; MIPSR6-O32-NEXT: min.s $f0, $f12, $f12
; MIPSR6-O32-NEXT: lw $1, %got(g)($1)
; MIPSR6-O32-NEXT: lh $2, 0($1)
-; MIPSR6-O32-NEXT: fill.h $w0, $2
-; MIPSR6-O32-NEXT: fexupr.w $w0, $w0
-; MIPSR6-O32-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-O32-NEXT: mtc1 $2, $f0
-; MIPSR6-O32-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT: fill.h $w1, $2
+; MIPSR6-O32-NEXT: fexupr.w $w1, $w1
+; MIPSR6-O32-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-O32-NEXT: mtc1 $2, $f1
+; MIPSR6-O32-NEXT: min.s $f0, $f1, $f0
; MIPSR6-O32-NEXT: mfc1 $2, $f0
; MIPSR6-O32-NEXT: fill.w $w0, $2
; MIPSR6-O32-NEXT: fexdo.h $w0, $w0, $w0
@@ -2485,13 +2486,14 @@ define void @fminnum(float %b) {
; MIPSR6-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
; MIPSR6-N32-NEXT: addu $1, $1, $25
; MIPSR6-N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPSR6-N32-NEXT: min.s $f0, $f12, $f12
; MIPSR6-N32-NEXT: lw $1, %got_disp(g)($1)
; MIPSR6-N32-NEXT: lh $2, 0($1)
-; MIPSR6-N32-NEXT: fill.h $w0, $2
-; MIPSR6-N32-NEXT: fexupr.w $w0, $w0
-; MIPSR6-N32-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-N32-NEXT: mtc1 $2, $f0
-; MIPSR6-N32-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT: fill.h $w1, $2
+; MIPSR6-N32-NEXT: fexupr.w $w1, $w1
+; MIPSR6-N32-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-N32-NEXT: mtc1 $2, $f1
+; MIPSR6-N32-NEXT: min.s $f0, $f1, $f0
; MIPSR6-N32-NEXT: mfc1 $2, $f0
; MIPSR6-N32-NEXT: fill.w $w0, $2
; MIPSR6-N32-NEXT: fexdo.h $w0, $w0, $w0
@@ -2504,20 +2506,20 @@ define void @fminnum(float %b) {
; MIPSR6-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fminnum)))
; MIPSR6-N64-NEXT: daddu $1, $1, $25
; MIPSR6-N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(fminnum)))
+; MIPSR6-N64-NEXT: min.s $f0, $f12, $f12
; MIPSR6-N64-NEXT: ld $1, %got_disp(g)($1)
; MIPSR6-N64-NEXT: lh $2, 0($1)
-; MIPSR6-N64-NEXT: fill.h $w0, $2
-; MIPSR6-N64-NEXT: fexupr.w $w0, $w0
-; MIPSR6-N64-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-N64-NEXT: mtc1 $2, $f0
-; MIPSR6-N64-NEXT: min.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT: fill.h $w1, $2
+; MIPSR6-N64-NEXT: fexupr.w $w1, $w1
+; MIPSR6-N64-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-N64-NEXT: mtc1 $2, $f1
+; MIPSR6-N64-NEXT: min.s $f0, $f1, $f0
; MIPSR6-N64-NEXT: mfc1 $2, $f0
; MIPSR6-N64-NEXT: fill.w $w0, $2
; MIPSR6-N64-NEXT: fexdo.h $w0, $w0, $w0
; MIPSR6-N64-NEXT: copy_u.h $2, $w0[0]
; MIPSR6-N64-NEXT: jr $ra
; MIPSR6-N64-NEXT: sh $2, 0($1)
-;
entry:
%0 = load i16, ptr @g, align 2
%1 = call float @llvm.convert.from.fp16.f32(i16 %0)
@@ -2632,17 +2634,18 @@ define void @fmaxnum(float %b) {
; MIPS64R5-N64-NEXT: daddiu $sp, $sp, 32
;
; MIPSR6-O32-LABEL: fmaxnum:
-; MIPSR6-O32: # %bb.0:
+; MIPSR6-O32: # %bb.0: # %entry
; MIPSR6-O32-NEXT: lui $2, %hi(_gp_disp)
; MIPSR6-O32-NEXT: addiu $2, $2, %lo(_gp_disp)
; MIPSR6-O32-NEXT: addu $1, $2, $25
+; MIPSR6-O32-NEXT: min.s $f0, $f12, $f12
; MIPSR6-O32-NEXT: lw $1, %got(g)($1)
; MIPSR6-O32-NEXT: lh $2, 0($1)
-; MIPSR6-O32-NEXT: fill.h $w0, $2
-; MIPSR6-O32-NEXT: fexupr.w $w0, $w0
-; MIPSR6-O32-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-O32-NEXT: mtc1 $2, $f0
-; MIPSR6-O32-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-O32-NEXT: fill.h $w1, $2
+; MIPSR6-O32-NEXT: fexupr.w $w1, $w1
+; MIPSR6-O32-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-O32-NEXT: mtc1 $2, $f1
+; MIPSR6-O32-NEXT: max.s $f0, $f1, $f0
; MIPSR6-O32-NEXT: mfc1 $2, $f0
; MIPSR6-O32-NEXT: fill.w $w0, $2
; MIPSR6-O32-NEXT: fexdo.h $w0, $w0, $w0
@@ -2651,17 +2654,18 @@ define void @fmaxnum(float %b) {
; MIPSR6-O32-NEXT: sh $2, 0($1)
;
; MIPSR6-N32-LABEL: fmaxnum:
-; MIPSR6-N32: # %bb.0:
+; MIPSR6-N32: # %bb.0: # %entry
; MIPSR6-N32-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
; MIPSR6-N32-NEXT: addu $1, $1, $25
; MIPSR6-N32-NEXT: addiu $1, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N32-NEXT: min.s $f0, $f12, $f12
; MIPSR6-N32-NEXT: lw $1, %got_disp(g)($1)
; MIPSR6-N32-NEXT: lh $2, 0($1)
-; MIPSR6-N32-NEXT: fill.h $w0, $2
-; MIPSR6-N32-NEXT: fexupr.w $w0, $w0
-; MIPSR6-N32-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-N32-NEXT: mtc1 $2, $f0
-; MIPSR6-N32-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-N32-NEXT: fill.h $w1, $2
+; MIPSR6-N32-NEXT: fexupr.w $w1, $w1
+; MIPSR6-N32-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-N32-NEXT: mtc1 $2, $f1
+; MIPSR6-N32-NEXT: max.s $f0, $f1, $f0
; MIPSR6-N32-NEXT: mfc1 $2, $f0
; MIPSR6-N32-NEXT: fill.w $w0, $2
; MIPSR6-N32-NEXT: fexdo.h $w0, $w0, $w0
@@ -2670,17 +2674,18 @@ define void @fmaxnum(float %b) {
; MIPSR6-N32-NEXT: sh $2, 0($1)
;
; MIPSR6-N64-LABEL: fmaxnum:
-; MIPSR6-N64: # %bb.0:
+; MIPSR6-N64: # %bb.0: # %entry
; MIPSR6-N64-NEXT: lui $1, %hi(%neg(%gp_rel(fmaxnum)))
; MIPSR6-N64-NEXT: daddu $1, $1, $25
; MIPSR6-N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(fmaxnum)))
+; MIPSR6-N64-NEXT: min.s $f0, $f12, $f12
; MIPSR6-N64-NEXT: ld $1, %got_disp(g)($1)
; MIPSR6-N64-NEXT: lh $2, 0($1)
-; MIPSR6-N64-NEXT: fill.h $w0, $2
-; MIPSR6-N64-NEXT: fexupr.w $w0, $w0
-; MIPSR6-N64-NEXT: copy_s.w $2, $w0[0]
-; MIPSR6-N64-NEXT: mtc1 $2, $f0
-; MIPSR6-N64-NEXT: max.s $f0, $f0, $f12
+; MIPSR6-N64-NEXT: fill.h $w1, $2
+; MIPSR6-N64-NEXT: fexupr.w $w1, $w1
+; MIPSR6-N64-NEXT: copy_s.w $2, $w1[0]
+; MIPSR6-N64-NEXT: mtc1 $2, $f1
+; MIPSR6-N64-NEXT: max.s $f0, $f1, $f0
; MIPSR6-N64-NEXT: mfc1 $2, $f0
; MIPSR6-N64-NEXT: fill.w $w0, $2
; MIPSR6-N64-NEXT: fexdo.h $w0, $w0, $w0
diff --git a/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll b/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll
index 5a7fcd1d0ddd..65338919f631 100644
--- a/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-tocdata-fastisel.ll
@@ -1,9 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff -fast-isel -verify-machineinstrs \
; RUN: -code-model=small | FileCheck %s --check-prefix=SMALL
-
-;; FIXME: when toc data for 64 big large code model is supported,
-;; add a run line for large code model too.
+; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff -fast-isel -verify-machineinstrs \
+; RUN: -code-model=large | FileCheck %s --check-prefix=LARGE
@a = global i32 0, align 4 #0
@@ -11,9 +10,15 @@ define signext i32 @foo() #1 {
; SMALL-LABEL: foo:
; SMALL: # %bb.0: # %entry
; SMALL-NEXT: la 3, a[TD](2)
-; SMALL-NEXT: lwz 3, 0(3)
-; SMALL-NEXT: extsw 3, 3
+; SMALL-NEXT: lwa 3, 0(3)
; SMALL-NEXT: blr
+;
+; LARGE-LABEL: foo:
+; LARGE: # %bb.0: # %entry
+; LARGE-NEXT: addis 3, a[TD]@u(2)
+; LARGE-NEXT: la 3, a[TD]@l(3)
+; LARGE-NEXT: lwa 3, 0(3)
+; LARGE-NEXT: blr
entry:
%0 = load i32, ptr @a, align 4
ret i32 %0
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-le.ll b/llvm/test/CodeGen/PowerPC/ctrloop-le.ll
index 599e540e898a..08ecd8970d83 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-le.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-le.ll
@@ -293,8 +293,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos1_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos1_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -323,8 +322,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos2_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos2_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -353,8 +351,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos4_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos4_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -383,8 +380,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos8_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos8_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -413,8 +409,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos16_rr_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos16_rr_sle(ptr nocapture %p, i32 %a, i32 %b) nounwind {
entry:
diff --git a/llvm/test/CodeGen/PowerPC/toc-data-no-data-sections.ll b/llvm/test/CodeGen/PowerPC/toc-data-no-data-sections.ll
new file mode 100644
index 000000000000..77851fb83025
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/toc-data-no-data-sections.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -data-sections=false -verify-machineinstrs < %s | FileCheck %s
+
+@a1 = global i32 0, align 4 #0
+
+define void @foo() {
+entry:
+ store i32 1, ptr @a1, align 4
+ ret void
+}
+
+attributes #0 = { "toc-data" }
+
+; CHECK: .toc
+; CHECK-NEXT: .csect a1[TD],2
+; CHECK-NEXT: .globl a1[TD]
+; CHECK-NEXT: .align 2
+; CHECK-NOT: a1[TD]:
+; CHECK-NEXT: .vbyte 4, 0
diff --git a/llvm/test/CodeGen/PowerPC/toc-data.ll b/llvm/test/CodeGen/PowerPC/toc-data.ll
index 7f7afe76cfcd..12286657488d 100644
--- a/llvm/test/CodeGen/PowerPC/toc-data.ll
+++ b/llvm/test/CodeGen/PowerPC/toc-data.ll
@@ -16,6 +16,10 @@
; RUN: -stop-before=ppc-vsx-copy | FileCheck %s --check-prefix CHECK32LARGE
; RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix TEST32LARGE
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s \
+; RUN: -stop-before=ppc-vsx-copy | FileCheck %s --check-prefix CHECK64LARGE
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix TEST64LARGE
+
; Global variables i and f have the toc-data attribute.
; In the following functions, those writing to or reading from
; variables i and f should use the toc-data access pattern.
@@ -45,8 +49,8 @@ define dso_local void @write_int(i32 signext %in) {
; CHECK64-NOOPT: name: write_int
; CHECK64-NOOPT: %[[SUBREG:[0-9]+]]:gprc = COPY %{{[0-9]}}.sub_32
-; CHECK64-NOOPT: %[[ADDR:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2 :: (load (s64) from got)
-; CHECK64-NOOPT: STW %[[SUBREG]], 0, killed %[[ADDR]] :: (store (s32) into @i)
+; CHECK64-NOOPT: %[[ADDR:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2
+; CHECK64-NOOPT: STW %[[SUBREG]], 0, %[[ADDR]]
; TEST64: .write_int:
; TEST64: la 4, i[TD](2)
@@ -63,6 +67,17 @@ define dso_local void @write_int(i32 signext %in) {
; TEST32LARGE-NEXT: la 4, i[TD]@l(4)
; TEST32LARGE-NEXT: stw 3, 0(4)
+
+; CHECK64LARGE: name: write_int
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @i
+; CHECK64LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItocL8 killed %[[SCRATCH1]], @i
+; CHECK64LARGE-NEXT: STW8 %{{[0-9]+}}, 0, killed %[[SCRATCH2]] :: (store (s32) into @i)
+
+; TEST64LARGE: .write_int:
+; TEST64LARGE: addis 4, i[TD]@u(2)
+; TEST64LARGE-NEXT: la 4, i[TD]@l(4)
+; TEST64LARGE-NEXT: stw 3, 0(4)
+
define dso_local i64 @read_ll() {
entry:
%0 = load i64, ptr @ll, align 8
@@ -98,6 +113,15 @@ define dso_local i64 @read_ll() {
; TEST32LARGE-NEXT: lwz 3, 0(4)
; TEST32LARGE-NEXT: lwz 4, 4(4)
+; CHECK64LARGE: name: read_ll
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @ll
+; CHECK64LARGE: LDtocL @ll, killed %[[SCRATCH1]] :: (load (s64) from got)
+
+; TEST64LARGE: .read_ll:
+; TEST64LARGE: addis 3, L..C0@u(2)
+; TEST64LARGE-NEXT: ld 3, L..C0@l(3)
+; TEST64LARGE-NEXT: ld 3, 0(3)
+
define dso_local float @read_float() {
entry:
%0 = load float, ptr @f, align 4
@@ -117,7 +141,7 @@ define dso_local float @read_float() {
; CHECK64-NOOPT: name: read_float
; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @f, $x2
-; CHECK64-NOOPT: %{{[0-9]+}}:f4rc = LFS 0, killed %[[SCRATCH]]
+; CHECK64-NOOPT: %{{[0-9]+}}:f4rc = LFS 0, %[[SCRATCH]]
; TEST64: .read_float:
; TEST64: la 3, f[TD](2)
@@ -134,6 +158,18 @@ define dso_local float @read_float() {
; TEST32LARGE-NEXT: la 3, f[TD]@l(3)
; TEST32LARGE-NEXT: lfs 1, 0(3)
+
+; CHECK64LARGE: name: read_float
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @f
+; CHECK64LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItocL8 killed %[[SCRATCH1]], @f
+; CHECK64LARGE-NEXT: LFS 0, killed %[[SCRATCH2]] :: (dereferenceable load (s32) from @f)
+
+
+; TEST64LARGE: .read_float:
+; TEST64LARGE: addis 3, f[TD]@u(2)
+; TEST64LARGE-NEXT: la 3, f[TD]@l(3)
+; TEST64LARGE-NEXT: lfs 1, 0(3)
+
define dso_local void @write_double(double %in) {
entry:
store double %in, ptr @d, align 8
@@ -167,6 +203,15 @@ define dso_local void @write_double(double %in) {
; TEST32LARGE-NEXT: lwz 3, L..C1@l(3)
; TEST32LARGE-NEXT: stfd 1, 0(3)
+; CHECK64LARGE: name: write_double
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @d
+; CHECK64LARGE: LDtocL @d, killed %[[SCRATCH1]] :: (load (s64) from got)
+
+; TEST64LARGE: .write_double:
+; TEST64LARGE: addis 3, L..C1@u(2)
+; TEST64LARGE-NEXT: ld 3, L..C1@l(3)
+; TEST64LARGE-NEXT: stfd 1, 0(3)
+
define dso_local nonnull ptr @addr() {
entry:
ret ptr @i
@@ -183,7 +228,7 @@ define dso_local nonnull ptr @addr() {
; CHECK64-NEXT: $x3 = COPY %[[SCRATCH]]
; CHECK64-NOOPT: name: addr
-; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc = ADDItoc8 @i, $x2
+; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2
; CHECK64-NOOPT: $x3 = COPY %[[SCRATCH]]
; TEST64: .addr
@@ -237,4 +282,26 @@ define dso_local nonnull ptr @addr() {
; TEST32LARGE-NEXT: .globl f[TD]
; TEST32LARGE-NOT: .tc f[TE],f[RW]
+; CHECK64LARGE: name: addr
+; CHECK64LARGE: %[[SCRATCH1:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @i
+; CHECK64LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:g8rc = ADDItocL8 killed %[[SCRATCH1]], @i
+; CHECK64LARGE-NEXT: $x3 = COPY %[[SCRATCH2]]
+
+; TEST64LARGE: .addr:
+; TEST64LARGE: addis 3, i[TD]@u(2)
+; TEST64LARGE: la 3, i[TD]@l(3)
+
+; TEST64LARGE: .toc
+; TEST64LARGE: .tc ll[TE],ll[RW]
+; TEST64LARGE-NOT: .csect ll[TD]
+; TEST64LARGE: .tc d[TE],d[RW]
+; TEST64LARGE-NOT: .csect d[TD],2
+; TEST64LARGE: .csect i[TD],2
+; TEST64LARGE-NEXT: .globl i[TD]
+; TEST64LARGE-NEXT: .align 2
+; TEST64LARGE-NOT: .tc i[TE],i[RW]
+; TEST64LARGE: .csect f[TD],2
+; TEST64LARGE-NEXT: .globl f[TD]
+; TEST64LARGE-NOT: .tc f[TE],f[RW]
+
attributes #0 = { "toc-data" }
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
new file mode 100644
index 000000000000..70d1b25309c8
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv32.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=RV32
+
+define i16 @constant_fold_barrier_i16(i16 %x, i16 %y) {
+; RV32-LABEL: constant_fold_barrier_i16:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: slli a1, a1, 11
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: addi a1, a1, 289
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: ret
+entry:
+ %and = and i16 %x, 2048
+ %or = or i16 %and, 2337
+ ret i16 %or
+}
+
+define void @constant_fold_barrier_i128(ptr %p) {
+; RV32-LABEL: constant_fold_barrier_i128:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 1
+; RV32-NEXT: slli a1, a1, 11
+; RV32-NEXT: lw a2, 0(a0)
+; RV32-NEXT: lw a3, 4(a0)
+; RV32-NEXT: lw a4, 8(a0)
+; RV32-NEXT: lw a5, 12(a0)
+; RV32-NEXT: and a2, a2, a1
+; RV32-NEXT: and a3, a3, zero
+; RV32-NEXT: and a4, a4, zero
+; RV32-NEXT: and a5, a5, zero
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: sltu a1, a2, a1
+; RV32-NEXT: add a6, a3, zero
+; RV32-NEXT: sltu a3, a6, a3
+; RV32-NEXT: add a6, a6, a1
+; RV32-NEXT: seqz a7, a6
+; RV32-NEXT: and a1, a7, a1
+; RV32-NEXT: or a1, a3, a1
+; RV32-NEXT: add a3, a4, zero
+; RV32-NEXT: sltu a4, a3, a4
+; RV32-NEXT: add a3, a3, a1
+; RV32-NEXT: seqz a7, a3
+; RV32-NEXT: and a1, a7, a1
+; RV32-NEXT: or a1, a4, a1
+; RV32-NEXT: add a5, a5, zero
+; RV32-NEXT: add a1, a5, a1
+; RV32-NEXT: sw a2, 0(a0)
+; RV32-NEXT: sw a6, 4(a0)
+; RV32-NEXT: sw a3, 8(a0)
+; RV32-NEXT: sw a1, 12(a0)
+; RV32-NEXT: ret
+entry:
+ %x = load i128, ptr %p
+ %and = and i128 %x, 2048
+ %add = add i128 %and, 2048
+ store i128 %add, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
new file mode 100644
index 000000000000..21d7b1d70714
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/constbarrier-rv64.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=RV64
+
+define i16 @constant_fold_barrier_i16(i16 %x, i16 %y) {
+; RV64-LABEL: constant_fold_barrier_i16:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: slli a1, a1, 11
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: addiw a1, a1, 289
+; RV64-NEXT: or a0, a0, a1
+; RV64-NEXT: ret
+entry:
+ %and = and i16 %x, 2048
+ %or = or i16 %and, 2337
+ ret i16 %or
+}
+
+define i128 @constant_fold_barrier_i128(i128 %x) {
+; RV64-LABEL: constant_fold_barrier_i128:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a2, 1
+; RV64-NEXT: slli a2, a2, 11
+; RV64-NEXT: and a0, a0, a2
+; RV64-NEXT: and a1, a1, zero
+; RV64-NEXT: add a0, a0, a2
+; RV64-NEXT: sltu a2, a0, a2
+; RV64-NEXT: add a1, a1, zero
+; RV64-NEXT: add a1, a1, a2
+; RV64-NEXT: ret
+entry:
+ %and = and i128 %x, 2048
+ %add = add i128 %and, 2048
+ ret i128 %add
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
index 6b1fc2042e2b..bbe8ef4b092d 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv32.mir
@@ -16,6 +16,38 @@ body: |
...
---
+name: constbarrier_i16
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constbarrier_i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2048
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; CHECK-NEXT: $x10 = COPY [[CONSTANT_FOLD_BARRIER]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s16) = G_CONSTANT i16 2048
+ %2:_(s16) = G_CONSTANT_FOLD_BARRIER %1
+ %3:_(s32) = G_ANYEXT %2(s16)
+ $x10 = COPY %3(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: constbarrier_i128
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constbarrier_i128
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2048
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; CHECK-NEXT: $x10 = COPY [[CONSTANT_FOLD_BARRIER]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s128) = G_CONSTANT i128 2048
+ %2:_(s128) = G_CONSTANT_FOLD_BARRIER %1
+ %3:_(s32) = G_TRUNC %2(s128)
+ $x10 = COPY %3(s32)
+ PseudoRET implicit $x10
+
+...
+---
name: constbarrier_nxv2i1
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir
index de6a82beee2a..96b1aa53d46e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-constbarrier-rv64.mir
@@ -33,6 +33,39 @@ body: |
...
---
+name: constbarrier_i16
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constbarrier_i16
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2048
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s32) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[CONSTANT_FOLD_BARRIER]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s16) = G_CONSTANT i16 2048
+ %2:_(s16) = G_CONSTANT_FOLD_BARRIER %1
+ %3:_(s64) = G_ANYEXT %2(s16)
+ $x10 = COPY %3(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: constbarrier_i128
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: constbarrier_i128
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2048
+ ; CHECK-NEXT: [[CONSTANT_FOLD_BARRIER:%[0-9]+]]:_(s64) = G_CONSTANT_FOLD_BARRIER [[C]]
+ ; CHECK-NEXT: $x10 = COPY [[CONSTANT_FOLD_BARRIER]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s128) = G_CONSTANT i128 2048
+ %2:_(s128) = G_CONSTANT_FOLD_BARRIER %1
+ %3:_(s64) = G_TRUNC %2(s128)
+ $x10 = COPY %3(s64)
+ PseudoRET implicit $x10
+
+...
+---
name: constbarrier_nxv2i1
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir
index 4177a40e3826..26d8785afb47 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv32.mir
@@ -555,3 +555,93 @@ body: |
PseudoRET implicit $x10, implicit $x11
...
+---
+name: udivrem_i32
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-I-LABEL: name: udivrem_i32
+ ; CHECK-I: liveins: $x10, $x11
+ ; CHECK-I-NEXT: {{ $}}
+ ; CHECK-I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__udivsi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__umodsi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY3]]
+ ; CHECK-I-NEXT: $x10 = COPY [[ADD]](s32)
+ ; CHECK-I-NEXT: PseudoRET implicit $x10
+ ;
+ ; CHECK-M-LABEL: name: udivrem_i32
+ ; CHECK-M: liveins: $x10, $x11
+ ; CHECK-M-NEXT: {{ $}}
+ ; CHECK-M-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[UDIV]], [[UREM]]
+ ; CHECK-M-NEXT: $x10 = COPY [[ADD]](s32)
+ ; CHECK-M-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32), %3:_(s32) = G_UDIVREM %0, %1
+ %4:_(s32) = G_ADD %2, %3
+ $x10 = COPY %4(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: sdivrem_i32
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-I-LABEL: name: sdivrem_i32
+ ; CHECK-I: liveins: $x10, $x11
+ ; CHECK-I-NEXT: {{ $}}
+ ; CHECK-I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__divsi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__modsi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY3]]
+ ; CHECK-I-NEXT: $x10 = COPY [[ADD]](s32)
+ ; CHECK-I-NEXT: PseudoRET implicit $x10
+ ;
+ ; CHECK-M-LABEL: name: sdivrem_i32
+ ; CHECK-M: liveins: $x10, $x11
+ ; CHECK-M-NEXT: {{ $}}
+ ; CHECK-M-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SDIV]], [[SREM]]
+ ; CHECK-M-NEXT: $x10 = COPY [[ADD]](s32)
+ ; CHECK-M-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32), %3:_(s32) = G_SDIVREM %0, %1
+ %4:_(s32) = G_ADD %2, %3
+ $x10 = COPY %4(s32)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
index 492f9530997c..bbbe38f695d2 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir
@@ -655,3 +655,93 @@ body: |
PseudoRET implicit $x10, implicit $x11
...
+---
+name: udivrem_i64
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-I-LABEL: name: udivrem_i64
+ ; CHECK-I: liveins: $x10, $x11
+ ; CHECK-I-NEXT: {{ $}}
+ ; CHECK-I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__udivdi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__umoddi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY2]], [[COPY3]]
+ ; CHECK-I-NEXT: $x10 = COPY [[ADD]](s64)
+ ; CHECK-I-NEXT: PseudoRET implicit $x10
+ ;
+ ; CHECK-M-LABEL: name: udivrem_i64
+ ; CHECK-M: liveins: $x10, $x11
+ ; CHECK-M-NEXT: {{ $}}
+ ; CHECK-M-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s64) = G_UDIV [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s64) = G_UREM [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[UDIV]], [[UREM]]
+ ; CHECK-M-NEXT: $x10 = COPY [[ADD]](s64)
+ ; CHECK-M-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64), %3:_(s64) = G_UDIVREM %0, %1
+ %4:_(s64) = G_ADD %2, %3
+ $x10 = COPY %4(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: sdivrem_i64
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-I-LABEL: name: sdivrem_i64
+ ; CHECK-I: liveins: $x10, $x11
+ ; CHECK-I-NEXT: {{ $}}
+ ; CHECK-I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__divdi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-I-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-I-NEXT: PseudoCALL target-flags(riscv-call) &__moddi3, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-I-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-I-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY2]], [[COPY3]]
+ ; CHECK-I-NEXT: $x10 = COPY [[ADD]](s64)
+ ; CHECK-I-NEXT: PseudoRET implicit $x10
+ ;
+ ; CHECK-M-LABEL: name: sdivrem_i64
+ ; CHECK-M: liveins: $x10, $x11
+ ; CHECK-M-NEXT: {{ $}}
+ ; CHECK-M-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s64) = G_SDIV [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[COPY]], [[COPY1]]
+ ; CHECK-M-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[SDIV]], [[SREM]]
+ ; CHECK-M-NEXT: $x10 = COPY [[ADD]](s64)
+ ; CHECK-M-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64), %3:_(s64) = G_SDIVREM %0, %1
+ %4:_(s64) = G_ADD %2, %3
+ $x10 = COPY %4(s64)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv32.mir
new file mode 100644
index 000000000000..adf3f450af7d
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv32.mir
@@ -0,0 +1,130 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s
+---
+name: frem_f32
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: frem_f32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: $x10 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_FREM %0, %1
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_f64
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11, $x12, $x13
+
+ ; CHECK-LABEL: name: frem_f64
+ ; CHECK: liveins: $x10, $x11, $x12, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[COPY1]](s32)
+ ; CHECK-NEXT: $x12 = COPY [[COPY2]](s32)
+ ; CHECK-NEXT: $x13 = COPY [[COPY3]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmod, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit-def $x10, implicit-def $x11
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: $x10 = COPY [[COPY4]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[COPY5]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_FREM %0, %1
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: frem_f16
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: name: frem_f16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[FPEXT]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[FPEXT1]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s16) = G_TRUNC %0(s32)
+ %3:_(s16) = G_TRUNC %1(s32)
+ %4:_(s16) = G_FREM %2, %3
+ %5:_(s32) = G_ANYEXT %4(s16)
+ $x10 = COPY %5(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_v2f32
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: name: frem_v2f32
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $v9
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[UV2]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[UV3]](s32)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<2 x s32>) = COPY $v8
+ %1:_(<2 x s32>) = COPY $v9
+ %2:_(<2 x s32>) = G_FREM %0, %1
+ $v8 = COPY %2(<2 x s32>)
+ PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv64.mir
new file mode 100644
index 000000000000..5db66bbf8e52
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-frem-rv64.mir
@@ -0,0 +1,130 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -run-pass=legalizer -global-isel-abort=0 %s -o - | FileCheck %s
+---
+name: frem_f32
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: frem_f32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: $x10 = COPY [[COPY2]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_FREM %0, %1
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_f64
+body: |
+ bb.1.entry:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: frem_f64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: $x10 = COPY [[COPY]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[COPY1]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmod, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: $x10 = COPY [[COPY2]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_FREM %0, %1
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_f16
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: name: frem_f16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[FPEXT]](s32)
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[FPEXT1]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[TRUNC2]](s32)
+ ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[FPTRUNC]](s16)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT2]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s16) = G_TRUNC %0(s64)
+ %3:_(s16) = G_TRUNC %1(s64)
+ %4:_(s16) = G_FREM %2, %3
+ %5:_(s64) = G_ANYEXT %4(s16)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: frem_v2f32
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: name: frem_v2f32
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $v8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $v9
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<2 x s32>)
+ ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV]](s32)
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[UV2]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[ANYEXT1]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[UV1]](s32)
+ ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[UV3]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT2]](s64)
+ ; CHECK-NEXT: $x11 = COPY [[ANYEXT3]](s64)
+ ; CHECK-NEXT: PseudoCALL target-flags(riscv-call) &fmodf, csr_ilp32_lp64, implicit-def $x1, implicit $x10, implicit $x11, implicit-def $x10
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $x2, implicit $x2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY3]](s64)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32)
+ ; CHECK-NEXT: $v8 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<2 x s32>) = COPY $v8
+ %1:_(<2 x s32>) = COPY $v9
+ %2:_(<2 x s32>) = G_FREM %0, %1
+ $v8 = COPY %2(<2 x s32>)
+ PseudoRET implicit $v8
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir
index 8cbae0fa0173..43318118f09c 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-lshr-rv64.mir
@@ -336,3 +336,29 @@ body: |
PseudoRET implicit $x10
...
+---
+name: lshr_i32_i48
+body: |
+ bb.1:
+ liveins: $x10
+
+ ; CHECK-LABEL: name: lshr_i32_i48
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C]](s64)
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %1:_(s64) = COPY $x10
+ %0:_(s48) = G_TRUNC %1(s64)
+ %2:_(s48) = G_CONSTANT i48 16
+ %6:_(s32) = G_TRUNC %0(s48)
+ %7:_(s32) = G_LSHR %6, %2(s48)
+ %5:_(s64) = G_ANYEXT %7(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir
new file mode 100644
index 000000000000..08aa92e0207b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv32.mir
@@ -0,0 +1,404 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV32I
+# RUN: llc -mtriple=riscv32 -mattr=+zbb -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV32ZBB
+
+---
+name: uaddsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV32I-LABEL: name: uaddsat_i32
+ ; RV32I: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY1]]
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[COPY2]]
+ ; RV32I-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32ZBB-LABEL: name: uaddsat_i32
+ ; RV32ZBB: liveins: $x10, $x11
+ ; RV32ZBB-NEXT: {{ $}}
+ ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32ZBB-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY]], [[C]]
+ ; RV32ZBB-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[XOR]], [[COPY1]]
+ ; RV32ZBB-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[UMIN]]
+ ; RV32ZBB-NEXT: $x10 = COPY [[ADD]](s32)
+ ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_UADDSAT %0, %1(s32)
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: uaddsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: uaddsat_i64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32)
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY5]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[COPY4]]
+ ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C1]], [[COPY5]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[SELECT2]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_UADDSAT %0, %1(s64)
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: saddsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV32I-LABEL: name: saddsat_i32
+ ; RV32I: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[ADD]](s32), [[COPY]]
+ ; RV32I-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY1]](s32), [[C]]
+ ; RV32I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP1]], [[ICMP]]
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; RV32I-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; RV32I-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY2]], [[C1]](s32)
+ ; RV32I-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; RV32I-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C2]]
+ ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[ADD1]], [[COPY2]]
+ ; RV32I-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32ZBB-LABEL: name: saddsat_i32
+ ; RV32ZBB: liveins: $x10, $x11
+ ; RV32ZBB-NEXT: {{ $}}
+ ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647
+ ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[C2]]
+ ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[SMAX]]
+ ; RV32ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[C2]]
+ ; RV32ZBB-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C1]], [[SMIN]]
+ ; RV32ZBB-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[COPY1]]
+ ; RV32ZBB-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB]]
+ ; RV32ZBB-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[SMIN1]]
+ ; RV32ZBB-NEXT: $x10 = COPY [[ADD]](s32)
+ ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_SADDSAT %0, %1(s32)
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: saddsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: saddsat_i64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY5]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY5]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY4]](s32), [[COPY]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]]
+ ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY3]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY2]](s32), [[C]]
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY5]], [[C2]](s32)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[COPY5]], [[C3]](s32)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C4]]
+ ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[C4]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[ADD3]](s32)
+ ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[ASHR1]], [[C5]]
+ ; CHECK-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[ADD4]], [[ICMP7]]
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:_(s32) = COPY [[ADD5]](s32)
+ ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[COPY6]], [[COPY4]]
+ ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[COPY7]], [[COPY5]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT2]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[SELECT3]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_SADDSAT %0, %1(s64)
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: usubsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV32I-LABEL: name: usubsat_i32
+ ; RV32I: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]]
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY1]]
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C]], [[SUB]]
+ ; RV32I-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32ZBB-LABEL: name: usubsat_i32
+ ; RV32ZBB: liveins: $x10, $x11
+ ; RV32ZBB-NEXT: {{ $}}
+ ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB-NEXT: [[UMIN:%[0-9]+]]:_(s32) = G_UMIN [[COPY]], [[COPY1]]
+ ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[UMIN]]
+ ; RV32ZBB-NEXT: $x10 = COPY [[SUB]](s32)
+ ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_USUBSAT %0, %1(s32)
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: usubsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: usubsat_i64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]]
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY1]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY1]](s32), [[COPY3]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C]], [[SUB]]
+ ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[SELECT]](s32), [[C1]], [[SUB2]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT1]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[SELECT2]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_USUBSAT %0, %1(s64)
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: ssubsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV32I-LABEL: name: ssubsat_i32
+ ; RV32I: liveins: $x10, $x11
+ ; RV32I-NEXT: {{ $}}
+ ; RV32I-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32I-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32I-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]]
+ ; RV32I-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32I-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]]
+ ; RV32I-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]]
+ ; RV32I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ICMP1]], [[ICMP]]
+ ; RV32I-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
+ ; RV32I-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; RV32I-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY2]], [[C1]](s32)
+ ; RV32I-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; RV32I-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C2]]
+ ; RV32I-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[ADD]], [[COPY2]]
+ ; RV32I-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; RV32I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV32ZBB-LABEL: name: ssubsat_i32
+ ; RV32ZBB: liveins: $x10, $x11
+ ; RV32ZBB-NEXT: {{ $}}
+ ; RV32ZBB-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; RV32ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; RV32ZBB-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2147483647
+ ; RV32ZBB-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; RV32ZBB-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; RV32ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[COPY]], [[C2]]
+ ; RV32ZBB-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[SMAX]], [[C]]
+ ; RV32ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[COPY]], [[C2]]
+ ; RV32ZBB-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[SMIN]], [[C1]]
+ ; RV32ZBB-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[COPY1]]
+ ; RV32ZBB-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[SUB1]]
+ ; RV32ZBB-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[SMIN1]]
+ ; RV32ZBB-NEXT: $x10 = COPY [[SUB2]](s32)
+ ; RV32ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s32) = COPY $x10
+ %1:_(s32) = COPY $x11
+ %2:_(s32) = G_SSUBSAT %0, %1(s32)
+ $x10 = COPY %2(s32)
+ PseudoRET implicit $x10
+
+...
+---
+name: ssubsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: ssubsat_i64
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $x12
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $x13
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY2]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY2]]
+ ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[COPY1]], [[COPY3]]
+ ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[ICMP]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[SUB2]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[SUB2]](s32), [[COPY1]]
+ ; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[SUB]](s32), [[COPY]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP2]](s32), [[ICMP3]], [[ICMP1]]
+ ; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY3]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s32) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+ ; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[COPY2]](s32), [[C]]
+ ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s32), [[ICMP6]], [[ICMP4]]
+ ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SELECT1]], [[SELECT]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C2]](s32)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+ ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SUB2]], [[C3]](s32)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C4]]
+ ; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD]](s32), [[C4]]
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ASHR1]], [[C5]]
+ ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[ICMP7]]
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD2]](s32)
+ ; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[COPY4]], [[SUB]]
+ ; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[XOR]](s32), [[COPY5]], [[SUB2]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT2]](s32)
+ ; CHECK-NEXT: $x11 = COPY [[SELECT3]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11
+ %2:_(s32) = COPY $x10
+ %3:_(s32) = COPY $x11
+ %0:_(s64) = G_MERGE_VALUES %2(s32), %3(s32)
+ %4:_(s32) = COPY $x12
+ %5:_(s32) = COPY $x13
+ %1:_(s64) = G_MERGE_VALUES %4(s32), %5(s32)
+ %6:_(s64) = G_SSUBSAT %0, %1(s64)
+ %7:_(s32), %8:_(s32) = G_UNMERGE_VALUES %6(s64)
+ $x10 = COPY %7(s32)
+ $x11 = COPY %8(s32)
+ PseudoRET implicit $x10, implicit $x11
+
+...
+---
+name: uaddsat_i8
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: uaddsat_i8
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $x11
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[AND]](s32), [[AND1]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[C2]], [[ADD]]
+ ; CHECK-NEXT: $x10 = COPY [[SELECT]](s32)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s32) = COPY $x10
+ %0:_(s8) = G_TRUNC %2(s32)
+ %3:_(s32) = COPY $x11
+ %1:_(s8) = G_TRUNC %3(s32)
+ %4:_(s8) = G_UADDSAT %0, %1(s8)
+ %5:_(s32) = G_ANYEXT %4(s8)
+ $x10 = COPY %5(s32)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
new file mode 100644
index 000000000000..5eaf8b37fe98
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-sat-rv64.mir
@@ -0,0 +1,358 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV64I
+# RUN: llc -mtriple=riscv64 -mattr=+zbb -run-pass=legalizer %s -o - \
+# RUN: | FileCheck %s --check-prefixes=CHECK,RV64ZBB
+
+---
+name: uaddsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: uaddsat_i32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[TRUNC1]]
+ ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[ADD]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ZEXT]](s64), [[AND]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[C1]], [[COPY2]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_UADDSAT %0, %1(s32)
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: uaddsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV64I-LABEL: name: uaddsat_i64
+ ; RV64I: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[ADD]](s64), [[COPY1]]
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[ADD]](s64)
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64I-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s64), [[C]], [[COPY2]]
+ ; RV64I-NEXT: $x10 = COPY [[SELECT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64ZBB-LABEL: name: uaddsat_i64
+ ; RV64ZBB: liveins: $x10, $x11
+ ; RV64ZBB-NEXT: {{ $}}
+ ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64ZBB-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[C]]
+ ; RV64ZBB-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN [[XOR]], [[COPY1]]
+ ; RV64ZBB-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[UMIN]]
+ ; RV64ZBB-NEXT: $x10 = COPY [[ADD]](s64)
+ ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_UADDSAT %0, %1(s64)
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: saddsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: saddsat_i32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+ ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[SEXT_INREG]], [[SEXT_INREG1]]
+ ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[ADD]], 32
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ne), [[ADD]](s64), [[SEXT_INREG2]]
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ADD]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[TRUNC]], [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C1]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[ADD1]], [[TRUNC]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_SADDSAT %0, %1(s32)
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: saddsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV64I-LABEL: name: saddsat_i64
+ ; RV64I: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[COPY1]]
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[ADD]](s64), [[COPY]]
+ ; RV64I-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]]
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64)
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64)
+ ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]]
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[ADD]](s64)
+ ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+ ; RV64I-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s64)
+ ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
+ ; RV64I-NEXT: [[ADD1:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C2]]
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32)
+ ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C3]]
+ ; RV64I-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AND]](s64), [[ADD1]], [[COPY2]]
+ ; RV64I-NEXT: $x10 = COPY [[SELECT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64ZBB-LABEL: name: saddsat_i64
+ ; RV64ZBB: liveins: $x10, $x11
+ ; RV64ZBB-NEXT: {{ $}}
+ ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807
+ ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
+ ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[COPY]], [[C2]]
+ ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[SMAX]]
+ ; RV64ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[COPY]], [[C2]]
+ ; RV64ZBB-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[C1]], [[SMIN]]
+ ; RV64ZBB-NEXT: [[SMAX1:%[0-9]+]]:_(s64) = G_SMAX [[SUB1]], [[COPY1]]
+ ; RV64ZBB-NEXT: [[SMIN1:%[0-9]+]]:_(s64) = G_SMIN [[SMAX1]], [[SUB]]
+ ; RV64ZBB-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[SMIN1]]
+ ; RV64ZBB-NEXT: $x10 = COPY [[ADD]](s64)
+ ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_SADDSAT %0, %1(s64)
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: usubsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: usubsat_i32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[TRUNC]], [[TRUNC1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4294967295
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND]](s64), [[AND1]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[C2]], [[SUB]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_USUBSAT %0, %1(s32)
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: usubsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV64I-LABEL: name: usubsat_i64
+ ; RV64I: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64I-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s64), [[C]], [[SUB]]
+ ; RV64I-NEXT: $x10 = COPY [[SELECT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64ZBB-LABEL: name: usubsat_i64
+ ; RV64ZBB: liveins: $x10, $x11
+ ; RV64ZBB-NEXT: {{ $}}
+ ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB-NEXT: [[UMIN:%[0-9]+]]:_(s64) = G_UMIN [[COPY]], [[COPY1]]
+ ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[UMIN]]
+ ; RV64ZBB-NEXT: $x10 = COPY [[SUB]](s64)
+ ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_USUBSAT %0, %1(s64)
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: ssubsat_i32
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: ssubsat_i32
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32
+ ; CHECK-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[SEXT_INREG]], [[SEXT_INREG1]]
+ ; CHECK-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s64) = G_SEXT_INREG [[SUB]], 32
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ne), [[SUB]](s64), [[SEXT_INREG2]]
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SUB]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
+ ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[TRUNC]], [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -2147483648
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[ASHR]], [[C1]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[ADD]], [[TRUNC]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s32) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s32) = G_TRUNC %3(s64)
+ %4:_(s32) = G_SSUBSAT %0, %1(s32)
+ %5:_(s64) = G_ANYEXT %4(s32)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: ssubsat_i64
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; RV64I-LABEL: name: ssubsat_i64
+ ; RV64I: liveins: $x10, $x11
+ ; RV64I-NEXT: {{ $}}
+ ; RV64I-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64I-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64I-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[COPY1]]
+ ; RV64I-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; RV64I-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SUB]](s64), [[COPY]]
+ ; RV64I-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]]
+ ; RV64I-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64)
+ ; RV64I-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64)
+ ; RV64I-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]]
+ ; RV64I-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[SUB]](s64)
+ ; RV64I-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
+ ; RV64I-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s64)
+ ; RV64I-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
+ ; RV64I-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[ASHR]], [[C2]]
+ ; RV64I-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32)
+ ; RV64I-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV64I-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C3]]
+ ; RV64I-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AND]](s64), [[ADD]], [[COPY2]]
+ ; RV64I-NEXT: $x10 = COPY [[SELECT]](s64)
+ ; RV64I-NEXT: PseudoRET implicit $x10
+ ;
+ ; RV64ZBB-LABEL: name: ssubsat_i64
+ ; RV64ZBB: liveins: $x10, $x11
+ ; RV64ZBB-NEXT: {{ $}}
+ ; RV64ZBB-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; RV64ZBB-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; RV64ZBB-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807
+ ; RV64ZBB-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
+ ; RV64ZBB-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+ ; RV64ZBB-NEXT: [[SMAX:%[0-9]+]]:_(s64) = G_SMAX [[COPY]], [[C2]]
+ ; RV64ZBB-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[SMAX]], [[C]]
+ ; RV64ZBB-NEXT: [[SMIN:%[0-9]+]]:_(s64) = G_SMIN [[COPY]], [[C2]]
+ ; RV64ZBB-NEXT: [[SUB1:%[0-9]+]]:_(s64) = G_SUB [[SMIN]], [[C1]]
+ ; RV64ZBB-NEXT: [[SMAX1:%[0-9]+]]:_(s64) = G_SMAX [[SUB]], [[COPY1]]
+ ; RV64ZBB-NEXT: [[SMIN1:%[0-9]+]]:_(s64) = G_SMIN [[SMAX1]], [[SUB1]]
+ ; RV64ZBB-NEXT: [[SUB2:%[0-9]+]]:_(s64) = G_SUB [[COPY]], [[SMIN1]]
+ ; RV64ZBB-NEXT: $x10 = COPY [[SUB2]](s64)
+ ; RV64ZBB-NEXT: PseudoRET implicit $x10
+ %0:_(s64) = COPY $x10
+ %1:_(s64) = COPY $x11
+ %2:_(s64) = G_SSUBSAT %0, %1(s64)
+ $x10 = COPY %2(s64)
+ PseudoRET implicit $x10
+
+...
+---
+name: uaddsat_i8
+body: |
+ bb.1:
+ liveins: $x10, $x11
+
+ ; CHECK-LABEL: name: uaddsat_i8
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+ ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+ ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[TRUNC1]]
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ADD]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 255
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(ult), [[AND]](s64), [[AND1]]
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s64), [[C2]], [[ADD]]
+ ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[SELECT]](s32)
+ ; CHECK-NEXT: $x10 = COPY [[ANYEXT1]](s64)
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %2:_(s64) = COPY $x10
+ %0:_(s8) = G_TRUNC %2(s64)
+ %3:_(s64) = COPY $x11
+ %1:_(s8) = G_TRUNC %3(s64)
+ %4:_(s8) = G_UADDSAT %0, %1(s8)
+ %5:_(s64) = G_ANYEXT %4(s8)
+ $x10 = COPY %5(s64)
+ PseudoRET implicit $x10
+
+...
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/libcalls.ll b/llvm/test/CodeGen/RISCV/GlobalISel/libcalls.ll
new file mode 100644
index 000000000000..aaef8d98c812
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/libcalls.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefix=RV32
+; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefix=RV64
+
+define float @test_f32(float %x, float %y) nounwind {
+; RV32-LABEL: test_f32:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: call fmodf
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_f32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: call fmodf
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+entry:
+ %z = frem float %x, %y
+ ret float %z
+}
+
+define double @test_f64(double %x, double %y) nounwind {
+; RV32-LABEL: test_f64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: call fmod
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_f64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: call fmod
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+entry:
+ %z = frem double %x, %y
+ ret double %z
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/shift.ll b/llvm/test/CodeGen/RISCV/GlobalISel/shift.ll
new file mode 100644
index 000000000000..b75cbf8e871a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/shift.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -global-isel -global-isel-abort=1 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv64 -global-isel -global-isel-abort=1 -verify-machineinstrs < %s 2>&1 | FileCheck %s --check-prefixes=RV64
+
+define i16 @test_lshr_i48(i48 %x) {
+; RV32-LABEL: test_lshr_i48:
+; RV32: # %bb.0:
+; RV32-NEXT: srli a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_lshr_i48:
+; RV64: # %bb.0:
+; RV64-NEXT: srliw a0, a0, 16
+; RV64-NEXT: ret
+ %lshr = lshr i48 %x, 16
+ %trunc = trunc i48 %lshr to i16
+ ret i16 %trunc
+}
+
+define i16 @test_ashr_i48(i48 %x) {
+; RV32-LABEL: test_ashr_i48:
+; RV32: # %bb.0:
+; RV32-NEXT: srai a0, a0, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_ashr_i48:
+; RV64: # %bb.0:
+; RV64-NEXT: sraiw a0, a0, 16
+; RV64-NEXT: ret
+ %ashr = ashr i48 %x, 16
+ %trunc = trunc i48 %ashr to i16
+ ret i16 %trunc
+}
+
+define i16 @test_shl_i48(i48 %x) {
+; RV32-LABEL: test_shl_i48:
+; RV32: # %bb.0:
+; RV32-NEXT: slli a0, a0, 8
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test_shl_i48:
+; RV64: # %bb.0:
+; RV64-NEXT: slliw a0, a0, 8
+; RV64-NEXT: ret
+ %shl = shl i48 %x, 8
+ %trunc = trunc i48 %shl to i16
+ ret i16 %trunc
+}
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index a1eb17956b82..c90bb031e082 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -389,7 +389,7 @@
; RV32ZACAS: .attribute 5, "rv32i2p1_a2p1_zacas1p0"
; RV32ZALASR: .attribute 5, "rv32i2p1_zalasr0p1"
; RV32ZAMA16B: .attribute 5, "rv32i2p1_zama16b1p0"
-; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp0p4"
+; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp0p4_zicsr2p0"
; RV32ZABHA: .attribute 5, "rv32i2p1_a2p1_zabha1p0"
; RV32SSNPM: .attribute 5, "rv32i2p1_ssnpm0p8"
; RV32SMNPM: .attribute 5, "rv32i2p1_smnpm0p8"
@@ -520,7 +520,7 @@
; RV64ZVFBFWMA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvfbfwma1p0_zvl32b1p0"
; RV64ZACAS: .attribute 5, "rv64i2p1_a2p1_zacas1p0"
; RV64ZALASR: .attribute 5, "rv64i2p1_zalasr0p1"
-; RV64ZICFILP: .attribute 5, "rv64i2p1_zicfilp0p4"
+; RV64ZICFILP: .attribute 5, "rv64i2p1_zicfilp0p4_zicsr2p0"
; RV64ZABHA: .attribute 5, "rv64i2p1_a2p1_zabha1p0"
; RV64SSNPM: .attribute 5, "rv64i2p1_ssnpm0p8"
; RV64SMNPM: .attribute 5, "rv64i2p1_smnpm0p8"
diff --git a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
index 8b22046cb624..8693283e8371 100644
--- a/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
+++ b/llvm/test/CodeGen/RISCV/loop-strength-reduce-loop-invar.ll
@@ -53,26 +53,24 @@ define void @test(i32 signext %row, i32 signext %N.in) nounwind {
; RV64: # %bb.0: # %entry
; RV64-NEXT: blez a1, .LBB0_3
; RV64-NEXT: # %bb.1: # %cond_true.preheader
-; RV64-NEXT: negw a1, a1
; RV64-NEXT: slli a0, a0, 6
; RV64-NEXT: lui a2, %hi(A)
; RV64-NEXT: addi a2, a2, %lo(A)
; RV64-NEXT: add a0, a0, a2
; RV64-NEXT: addi a2, a0, 4
+; RV64-NEXT: addiw a1, a1, 2
; RV64-NEXT: li a3, 2
; RV64-NEXT: li a4, 4
; RV64-NEXT: li a5, 5
-; RV64-NEXT: li a6, 2
; RV64-NEXT: .LBB0_2: # %cond_true
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-NEXT: sw a4, 0(a2)
-; RV64-NEXT: slli a7, a6, 2
-; RV64-NEXT: add a7, a0, a7
-; RV64-NEXT: sw a5, 0(a7)
-; RV64-NEXT: addiw a6, a6, 1
-; RV64-NEXT: addw a7, a1, a6
+; RV64-NEXT: slli a6, a3, 2
+; RV64-NEXT: add a6, a0, a6
+; RV64-NEXT: sw a5, 0(a6)
+; RV64-NEXT: addiw a3, a3, 1
; RV64-NEXT: addi a2, a2, 4
-; RV64-NEXT: bne a7, a3, .LBB0_2
+; RV64-NEXT: bne a3, a1, .LBB0_2
; RV64-NEXT: .LBB0_3: # %return
; RV64-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 364e8c7b38da..42ea425f99c0 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -1843,3 +1843,152 @@ define i8 @mulsub_demand_2(i8 %x, i8 %y) nounwind {
%r = or i8 %a, 240
ret i8 %r
}
+
+define i64 @muland_demand(i64 %x) nounwind {
+; RV32I-LABEL: muland_demand:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: andi a0, a0, -8
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: srli a1, a1, 2
+; RV32I-NEXT: li a2, 12
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: call __muldi3
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: muland_demand:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: andi a0, a0, -8
+; RV32IM-NEXT: li a2, 12
+; RV32IM-NEXT: mul a1, a1, a2
+; RV32IM-NEXT: mulhu a3, a0, a2
+; RV32IM-NEXT: add a1, a3, a1
+; RV32IM-NEXT: mul a0, a0, a2
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: muland_demand:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, -29
+; RV64I-NEXT: srli a1, a1, 2
+; RV64I-NEXT: and a0, a0, a1
+; RV64I-NEXT: li a1, 12
+; RV64I-NEXT: tail __muldi3
+;
+; RV64IM-LABEL: muland_demand:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: andi a0, a0, -8
+; RV64IM-NEXT: li a1, 12
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: ret
+ %and = and i64 %x, 4611686018427387896
+ %mul = mul i64 %and, 12
+ ret i64 %mul
+}
+
+define i64 @mulzext_demand(i32 signext %x) nounwind {
+; RV32I-LABEL: mulzext_demand:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a3, 3
+; RV32I-NEXT: li a2, 0
+; RV32I-NEXT: call __muldi3
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: mulzext_demand:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a1, a0, 1
+; RV32IM-NEXT: add a1, a1, a0
+; RV32IM-NEXT: li a0, 0
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: mulzext_demand:
+; RV64I: # %bb.0:
+; RV64I-NEXT: li a1, 3
+; RV64I-NEXT: slli a1, a1, 32
+; RV64I-NEXT: tail __muldi3
+;
+; RV64IM-LABEL: mulzext_demand:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: li a1, 3
+; RV64IM-NEXT: slli a1, a1, 32
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: ret
+ %ext = zext i32 %x to i64
+ %mul = mul i64 %ext, 12884901888
+ ret i64 %mul
+}
+
+define i32 @mulfshl_demand(i32 signext %x) nounwind {
+; RV32I-LABEL: mulfshl_demand:
+; RV32I: # %bb.0:
+; RV32I-NEXT: srli a0, a0, 11
+; RV32I-NEXT: lui a1, 92808
+; RV32I-NEXT: tail __mulsi3
+;
+; RV32IM-LABEL: mulfshl_demand:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: srli a0, a0, 11
+; RV32IM-NEXT: lui a1, 92808
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: mulfshl_demand:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: srliw a0, a0, 11
+; RV64I-NEXT: lui a1, 92808
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: mulfshl_demand:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: srliw a0, a0, 11
+; RV64IM-NEXT: lui a1, 92808
+; RV64IM-NEXT: mulw a0, a0, a1
+; RV64IM-NEXT: ret
+ %fshl = tail call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 21)
+ %mul = mul i32 %fshl, 380141568
+ ret i32 %mul
+}
+
+define i32 @mulor_demand(i32 signext %x, i32 signext %y) nounwind {
+; RV32I-LABEL: mulor_demand:
+; RV32I: # %bb.0:
+; RV32I-NEXT: lui a1, 92808
+; RV32I-NEXT: tail __mulsi3
+;
+; RV32IM-LABEL: mulor_demand:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: lui a1, 92808
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: mulor_demand:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: lui a1, 92808
+; RV64I-NEXT: call __muldi3
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: mulor_demand:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: lui a1, 92808
+; RV64IM-NEXT: mulw a0, a0, a1
+; RV64IM-NEXT: ret
+ %mul1 = mul i32 %y, 10485760
+ %or = or disjoint i32 %mul1, %x
+ %mul2 = mul i32 %or, 380141568
+ ret i32 %mul2
+}
diff --git a/llvm/test/CodeGen/RISCV/pr90730.ll b/llvm/test/CodeGen/RISCV/pr90730.ll
new file mode 100644
index 000000000000..7c3f4b43089c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr90730.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zbb | FileCheck %s
+
+define i32 @pr90730(i32 %x, i1 %y, ptr %p) {
+; CHECK-LABEL: pr90730:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lui a1, 8
+; CHECK-NEXT: addiw a1, a1, -960
+; CHECK-NEXT: andn a0, a1, a0
+; CHECK-NEXT: sw zero, 0(a2)
+; CHECK-NEXT: ret
+entry:
+ %ext = zext i1 %y to i32
+ %xor1 = xor i32 %ext, 31817
+ %and1 = and i32 %xor1, %x
+ store i32 %and1, ptr %p, align 4
+ %v = load i32, ptr %p, align 4
+ %and2 = and i32 %v, 31808
+ %xor2 = xor i32 %and2, 31808
+ store i32 0, ptr %p, align 4
+ ret i32 %xor2
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
index 2db0d40b0ce5..cf7be57ccc90 100644
--- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
@@ -637,8 +637,6 @@ define i64 @zext_mul288(i32 signext %a) {
define i64 @zext_mul12884901888(i32 signext %a) {
; RV64I-LABEL: zext_mul12884901888:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 3
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -646,8 +644,8 @@ define i64 @zext_mul12884901888(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul12884901888:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh1add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 12884901888
@@ -658,8 +656,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
define i64 @zext_mul21474836480(i32 signext %a) {
; RV64I-LABEL: zext_mul21474836480:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 5
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -667,8 +663,8 @@ define i64 @zext_mul21474836480(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul21474836480:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh2add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 21474836480
@@ -679,8 +675,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
define i64 @zext_mul38654705664(i32 signext %a) {
; RV64I-LABEL: zext_mul38654705664:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 9
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -688,8 +682,8 @@ define i64 @zext_mul38654705664(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul38654705664:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh3add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 38654705664
diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll
index dc93c0215a25..4a568fb2b25c 100644
--- a/llvm/test/CodeGen/RISCV/rv64zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -856,8 +856,6 @@ define i64 @zext_mul288(i32 signext %a) {
define i64 @zext_mul12884901888(i32 signext %a) {
; RV64I-LABEL: zext_mul12884901888:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 3
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -865,8 +863,8 @@ define i64 @zext_mul12884901888(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul12884901888:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh1add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 12884901888
@@ -877,8 +875,6 @@ define i64 @zext_mul12884901888(i32 signext %a) {
define i64 @zext_mul21474836480(i32 signext %a) {
; RV64I-LABEL: zext_mul21474836480:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 5
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -886,8 +882,8 @@ define i64 @zext_mul21474836480(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul21474836480:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh2add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 21474836480
@@ -898,8 +894,6 @@ define i64 @zext_mul21474836480(i32 signext %a) {
define i64 @zext_mul38654705664(i32 signext %a) {
; RV64I-LABEL: zext_mul38654705664:
; RV64I: # %bb.0:
-; RV64I-NEXT: slli a0, a0, 32
-; RV64I-NEXT: srli a0, a0, 32
; RV64I-NEXT: li a1, 9
; RV64I-NEXT: slli a1, a1, 32
; RV64I-NEXT: mul a0, a0, a1
@@ -907,8 +901,8 @@ define i64 @zext_mul38654705664(i32 signext %a) {
;
; RV64ZBA-LABEL: zext_mul38654705664:
; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: sh3add a0, a0, a0
+; RV64ZBA-NEXT: slli a0, a0, 32
; RV64ZBA-NEXT: ret
%b = zext i32 %a to i64
%c = mul i64 %b, 38654705664
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
index f189354237ee..f189354237ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-costrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll
diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
index 3276f481f30e..3276f481f30e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fround-costrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll
diff --git a/llvm/test/CodeGen/RISCV/sextw-removal.ll b/llvm/test/CodeGen/RISCV/sextw-removal.ll
index f707cb31e3ec..8cf78551d28f 100644
--- a/llvm/test/CodeGen/RISCV/sextw-removal.ll
+++ b/llvm/test/CodeGen/RISCV/sextw-removal.ll
@@ -1047,25 +1047,25 @@ define signext i32 @bug(i32 signext %x) {
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 3
; CHECK-NEXT: sllw a1, a1, a3
-; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: negw a2, a2
; CHECK-NEXT: andi a2, a2, -8
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: srliw a2, a1, 28
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 2
; CHECK-NEXT: sllw a1, a1, a3
-; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: negw a2, a2
; CHECK-NEXT: andi a2, a2, -4
; CHECK-NEXT: add a0, a0, a2
; CHECK-NEXT: srliw a2, a1, 30
; CHECK-NEXT: seqz a2, a2
; CHECK-NEXT: slli a3, a2, 1
; CHECK-NEXT: sllw a1, a1, a3
-; CHECK-NEXT: neg a2, a2
+; CHECK-NEXT: negw a2, a2
; CHECK-NEXT: andi a2, a2, -2
; CHECK-NEXT: add a0, a0, a2
-; CHECK-NEXT: srai a1, a1, 31
; CHECK-NEXT: not a1, a1
+; CHECK-NEXT: srli a1, a1, 31
; CHECK-NEXT: addw a0, a0, a1
; CHECK-NEXT: .LBB18_4: # %cleanup
; CHECK-NEXT: ret
@@ -1087,28 +1087,27 @@ define signext i32 @bug(i32 signext %x) {
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 3
; NOREMOVAL-NEXT: sllw a1, a1, a3
-; NOREMOVAL-NEXT: neg a2, a2
+; NOREMOVAL-NEXT: negw a2, a2
; NOREMOVAL-NEXT: andi a2, a2, -8
; NOREMOVAL-NEXT: add a0, a0, a2
; NOREMOVAL-NEXT: srliw a2, a1, 28
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 2
; NOREMOVAL-NEXT: sllw a1, a1, a3
-; NOREMOVAL-NEXT: neg a2, a2
+; NOREMOVAL-NEXT: negw a2, a2
; NOREMOVAL-NEXT: andi a2, a2, -4
; NOREMOVAL-NEXT: add a0, a0, a2
; NOREMOVAL-NEXT: srliw a2, a1, 30
; NOREMOVAL-NEXT: seqz a2, a2
; NOREMOVAL-NEXT: slli a3, a2, 1
; NOREMOVAL-NEXT: sllw a1, a1, a3
-; NOREMOVAL-NEXT: neg a2, a2
+; NOREMOVAL-NEXT: negw a2, a2
; NOREMOVAL-NEXT: andi a2, a2, -2
; NOREMOVAL-NEXT: add a0, a0, a2
-; NOREMOVAL-NEXT: srai a1, a1, 31
; NOREMOVAL-NEXT: not a1, a1
-; NOREMOVAL-NEXT: add a0, a0, a1
+; NOREMOVAL-NEXT: srli a1, a1, 31
+; NOREMOVAL-NEXT: addw a0, a0, a1
; NOREMOVAL-NEXT: .LBB18_4: # %cleanup
-; NOREMOVAL-NEXT: sext.w a0, a0
; NOREMOVAL-NEXT: ret
entry:
%tobool.not = icmp eq i32 %x, 0
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
new file mode 100644
index 000000000000..449dd7195450
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_inline_assembly/inline_asm.ll
@@ -0,0 +1,93 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s --spirv-ext=+SPV_INTEL_inline_assembly -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s --spirv-ext=+SPV_INTEL_inline_assembly -o - -filetype=obj | spirv-val %}
+
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; CHECK-ERROR: Inline assembly instructions require the following SPIR-V extension: SPV_INTEL_inline_assembly
+
+; CHECK: OpCapability AsmINTEL
+; CHECK: OpExtension "SPV_INTEL_inline_assembly"
+
+; CHECK-COUNT-8: OpDecorate %[[#]] SideEffectsINTEL
+
+; CHECK-DAG: %[[#VoidTy:]] = OpTypeVoid
+; CHECK-DAG: %[[#Int8Ty:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#Int32Ty:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#Int64Ty:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#HalfTy:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#FloatTy:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#DoubleTy:]] = OpTypeFloat 64
+
+; CHECK-DAG: OpTypeFunction %[[#VoidTy]] %[[#]] %[[#]] %[[#]] %[[#Int64Ty]]
+; CHECK-DAG: %[[#Fun1Ty:]] = OpTypeFunction %[[#VoidTy]]
+; CHECK-DAG: %[[#Fun2Ty:]] = OpTypeFunction %[[#Int32Ty]]
+; CHECK-DAG: %[[#Fun3Ty:]] = OpTypeFunction %[[#Int32Ty]] %[[#Int32Ty]]
+; CHECK-DAG: %[[#Fun4Ty:]] = OpTypeFunction %[[#FloatTy]] %[[#FloatTy]]
+; CHECK-DAG: %[[#Fun5Ty:]] = OpTypeFunction %[[#HalfTy]] %[[#FloatTy]] %[[#FloatTy]]
+; CHECK-DAG: %[[#Fun6Ty:]] = OpTypeFunction %[[#Int8Ty]] %[[#FloatTy]] %[[#Int32Ty]] %[[#Int8Ty]]
+; CHECK-DAG: %[[#Fun7Ty:]] = OpTypeFunction %[[#Int64Ty]] %[[#Int64Ty]] %[[#Int32Ty]] %[[#Int8Ty]]
+; CHECK-DAG: %[[#Fun8Ty:]] = OpTypeFunction %[[#VoidTy]] %[[#Int32Ty]] %[[#DoubleTy]]
+
+; CHECK-DAG: %[[#Const2:]] = OpConstant %[[#FloatTy]] 2
+; CHECK-DAG: %[[#Const123:]] = OpConstant %[[#Int32Ty]] 123
+; CHECK-DAG: %[[#Const42:]] = OpConstant %[[#DoubleTy:]] 42
+
+; CHECK: %[[#Dialect:]] = OpAsmTargetINTEL "spirv64-unknown-unknown"
+; CHECK-NO: OpAsmTargetINTEL
+
+; CHECK: %[[#Asm1:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" ""
+; CHECK: %[[#Asm2:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "nop" ""
+; CHECK: %[[#Asm3:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun1Ty]] %[[#Dialect]] "" "~{cc},~{memory}"
+; CHECK: %[[#Asm4:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun2Ty:]] %[[#Dialect]] "clobber_out $0" "=&r"
+; CHECK: %[[#Asm5:]] = OpAsmINTEL %[[#Int32Ty]] %[[#Fun3Ty]] %[[#Dialect]] "icmd $0 $1" "=r,r"
+; CHECK: %[[#Asm6:]] = OpAsmINTEL %[[#FloatTy]] %[[#Fun4Ty]] %[[#Dialect]] "fcmd $0 $1" "=r,r"
+; CHECK: %[[#Asm7:]] = OpAsmINTEL %[[#HalfTy]] %[[#Fun5Ty]] %[[#Dialect]] "fcmdext $0 $1 $2" "=r,r,r"
+; CHECK: %[[#Asm8:]] = OpAsmINTEL %[[#Int8Ty]] %[[#Fun6Ty]] %[[#Dialect]] "cmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK: %[[#Asm9:]] = OpAsmINTEL %[[#Int64Ty]] %[[#Fun7Ty]] %[[#Dialect]] "icmdext $0 $3 $1 $2" "=r,r,r,r"
+; CHECK: %[[#Asm10:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "r,r"
+; CHECK: %[[#Asm11:]] = OpAsmINTEL %[[#VoidTy]] %[[#Fun8Ty]] %[[#Dialect]] "constcmd $0 $1" "i,i"
+; CHECK-NO: OpAsmINTEL
+
+; CHECK: OpFunction
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm1]]
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm2]]
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm3]]
+; CHECK: OpAsmCallINTEL %[[#Int32Ty]] %[[#Asm4]]
+; CHECK: OpAsmCallINTEL %[[#Int32Ty]] %[[#Asm5]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#FloatTy]] %[[#Asm6]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#HalfTy]] %[[#Asm7]] %[[#Const2]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#Int8Ty]] %[[#Asm8]] %[[#]] %[[#Const123]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#Int64Ty]] %[[#Asm9]] %[[#]] %[[#]] %[[#]]
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm10]] %[[#Const123]] %[[#Const42]]
+; CHECK: OpAsmCallINTEL %[[#VoidTy]] %[[#Asm11]] %[[#Const123]] %[[#Const42]]
+; CHECK-NO: OpAsmCallINTEL
+
+define spir_kernel void @foo(ptr addrspace(1) %_arg_int, ptr addrspace(1) %_arg_float, ptr addrspace(1) %_arg_half, i64 %_lng) {
+ %i1 = load i32, ptr addrspace(1) %_arg_int
+ %i2 = load i8, ptr addrspace(1) %_arg_int
+ %f1 = load float, ptr addrspace(1) %_arg_float
+ %h1 = load half, ptr addrspace(1) %_arg_half
+ ; inline asm
+ call void asm sideeffect "", ""()
+ call void asm sideeffect "nop", ""()
+ call void asm sideeffect "", "~{cc},~{memory}"()
+ %res_i0 = call i32 asm "clobber_out $0", "=&r"()
+ store i32 %res_i0, ptr addrspace(1) %_arg_int
+ ; inline asm: integer
+ %res_i1 = call i32 asm sideeffect "icmd $0 $1", "=r,r"(i32 %i1)
+ store i32 %res_i1, ptr addrspace(1) %_arg_int
+ ; inline asm: float
+ %res_f1 = call float asm sideeffect "fcmd $0 $1", "=r,r"(float %f1)
+ store float %res_f1, ptr addrspace(1) %_arg_float
+ ; inline asm: mixed floats
+ %res_f2 = call half asm sideeffect "fcmdext $0 $1 $2", "=r,r,r"(float 2.0, float %f1)
+ store half %res_f2, ptr addrspace(1) %_arg_half
+ ; inline asm: mixed operands of different types
+ call i8 asm sideeffect "cmdext $0 $3 $1 $2", "=r,r,r,r"(float %f1, i32 123, i8 %i2)
+ ; inline asm: mixed integers
+ %res_i2 = call i64 asm sideeffect "icmdext $0 $3 $1 $2", "=r,r,r,r"(i64 %_lng, i32 %i1, i8 %i2)
+ store i64 %res_i2, ptr addrspace(1) %_arg_int
+ ; inline asm: constant arguments, misc constraints
+ call void asm "constcmd $0 $1", "r,r"(i32 123, double 42.0)
+ call void asm "constcmd $0 $1", "i,i"(i32 123, double 42.0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
new file mode 100644
index 000000000000..e219f61b5c6e
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_shader_clock/shader_clock.ll
@@ -0,0 +1,59 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_shader_clock %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_shader_clock %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: clock_read_device: the builtin requires the following SPIR-V extension: SPV_KHR_shader_clock
+
+; CHECK: OpCapability ShaderClockKHR
+; CHECK: OpExtension "SPV_KHR_shader_clock"
+; CHECK-DAG: [[uint:%[a-z0-9_]+]] = OpTypeInt 32
+; CHECK-DAG: [[ulong:%[a-z0-9_]+]] = OpTypeInt 64
+; CHECK-DAG: [[v2uint:%[a-z0-9_]+]] = OpTypeVector [[uint]] 2
+; CHECK-DAG: [[uint_1:%[a-z0-9_]+]] = OpConstant [[uint]] 1
+; CHECK-DAG: [[uint_2:%[a-z0-9_]+]] = OpConstant [[uint]] 2
+; CHECK-DAG: [[uint_3:%[a-z0-9_]+]] = OpConstant [[uint]] 3
+; CHECK: OpReadClockKHR [[ulong]] [[uint_1]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_2]]
+; CHECK: OpReadClockKHR [[ulong]] [[uint_3]]
+; CHECK: OpReadClockKHR [[v2uint]] [[uint_1]]
+; CHECK: OpReadClockKHR [[v2uint]] [[uint_2]]
+; CHECK: OpReadClockKHR [[v2uint]] [[uint_3]]
+
+define dso_local spir_kernel void @test_clocks(ptr addrspace(1) nocapture noundef writeonly align 8 %out64, ptr addrspace(1) nocapture noundef writeonly align 8 %outv2) {
+entry:
+ %call = tail call spir_func i64 @_Z17clock_read_devicev()
+ store i64 %call, ptr addrspace(1) %out64, align 8
+ %call1 = tail call spir_func i64 @_Z21clock_read_work_groupv()
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %out64, i32 8
+ store i64 %call1, ptr addrspace(1) %arrayidx2, align 8
+ %call3 = tail call spir_func i64 @_Z20clock_read_sub_groupv()
+ %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %out64, i32 16
+ store i64 %call3, ptr addrspace(1) %arrayidx4, align 8
+ %call5 = tail call spir_func <2 x i32> @_Z22clock_read_hilo_devicev()
+ store <2 x i32> %call5, ptr addrspace(1) %outv2, align 8
+ %call7 = tail call spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv()
+ %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 8
+ store <2 x i32> %call7, ptr addrspace(1) %arrayidx8, align 8
+ %call9 = tail call spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv()
+ %arrayidx10 = getelementptr inbounds i8, ptr addrspace(1) %outv2, i32 16
+ store <2 x i32> %call9, ptr addrspace(1) %arrayidx10, align 8
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z17clock_read_devicev() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z21clock_read_work_groupv() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z20clock_read_sub_groupv() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <2 x i32> @_Z22clock_read_hilo_devicev() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <2 x i32> @_Z26clock_read_hilo_work_groupv() local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <2 x i32> @_Z25clock_read_hilo_sub_groupv() local_unnamed_addr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
index 83d7275358ce..3300d46bf856 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-add.ll
@@ -130,26 +130,26 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov.u16 r3, q0[2]
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, r2, d4
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
@@ -228,8 +228,8 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r2, r1, d0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: bx lr
entry:
@@ -397,26 +397,26 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov.u8 r3, q0[2]
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u8 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, r2, d4
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u8 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vmov.u8 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u8 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u8 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov.u8 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
@@ -540,26 +540,26 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r0, r1, d5
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, r2, d4
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, s10
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s10
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov.u16 r1, q0[5]
+; CHECK-NEXT: vmov q2[2], q2[0], r3, r1
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: add r0, r2
-; CHECK-NEXT: vmov r2, r3, d5
-; CHECK-NEXT: adds r0, r0, r2
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
@@ -648,8 +648,8 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r0, r1, d1
-; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov r2, r1, d0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: bx lr
entry:
@@ -834,8 +834,8 @@ define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, r12, d4
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.u16 r2, q0[2]
@@ -943,8 +943,8 @@ define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r12, d1
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, r12, d0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
@@ -1130,8 +1130,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, r12, d4
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: vmov.u8 r2, q0[2]
@@ -1283,8 +1283,8 @@ define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vmov r2, r12, d5
-; CHECK-NEXT: vmov r3, s8
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r3, r12, d4
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.u16 r2, q0[2]
@@ -1402,8 +1402,8 @@ define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vmov r2, r12, d1
-; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, r12, d0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel-call-indirect64.ll b/llvm/test/CodeGen/WebAssembly/fast-isel-call-indirect64.ll
deleted file mode 100644
index 8224c3bc4e37..000000000000
--- a/llvm/test/CodeGen/WebAssembly/fast-isel-call-indirect64.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -fast-isel --mtriple=wasm64 -asm-verbose=false -wasm-keep-registers | FileCheck %s
-
-target triple = "wasm64"
-
-; Ensure fast isel also lowers function pointers to 32-bit.
-
-; CHECK: local.get $push[[L0:[0-9]+]]=, 0
-; CHECK-NEXT: i32.wrap_i64 $push[[L1:[0-9]+]]=, $pop[[L0]]
-; CHECK-NEXT: call_indirect $pop[[L1]]
-
-define hidden void @f(ptr %g) {
- call void %g()
- ret void
-}
diff --git a/llvm/test/CodeGen/WebAssembly/function-pointer64.ll b/llvm/test/CodeGen/WebAssembly/function-pointer64.ll
index c7c90f6b7ac2..7f98d3e648bd 100644
--- a/llvm/test/CodeGen/WebAssembly/function-pointer64.ll
+++ b/llvm/test/CodeGen/WebAssembly/function-pointer64.ll
@@ -34,7 +34,6 @@ entry:
; CHECK: .functype foo (i64) -> ()
; CHECK-NEXT: i32.const 1
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i32.wrap_i64
; CHECK-NEXT: call_indirect (i32) -> ()
; REF: call_indirect __indirect_function_table, (i32) -> ()
@@ -53,10 +52,10 @@ entry:
; YAML: - Type: CODE
; YAML: - Type: R_WASM_TABLE_INDEX_SLEB64
; YAML-NEXT: Index: 0
-; YAML-NEXT: Offset: 0x16
+; YAML-NEXT: Offset: 0x15
; YAML: - Type: R_WASM_TABLE_INDEX_SLEB64
; YAML-NEXT: Index: 0
-; YAML-NEXT: Offset: 0x29
+; YAML-NEXT: Offset: 0x28
; YAML: - Type: DATA
; YAML: - Type: R_WASM_TABLE_INDEX_I64
diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index 89e9c42637c1..d9d3f6be800f 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
-; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision | FileCheck %s
+; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision,+simd128 | FileCheck %s
+; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+half-precision,+simd128 | FileCheck %s
declare float @llvm.wasm.loadf32.f16(ptr)
declare void @llvm.wasm.storef16.f32(float, ptr)
@@ -19,3 +19,19 @@ define void @stf16_32(float %v, ptr %p) {
tail call void @llvm.wasm.storef16.f32(float %v, ptr %p)
ret void
}
+
+; CHECK-LABEL: splat_v8f16:
+; CHECK: f16x8.splat $push0=, $0
+; CHECK-NEXT: return $pop0
+define <8 x half> @splat_v8f16(float %x) {
+ %v = call <8 x half> @llvm.wasm.splat.f16x8(float %x)
+ ret <8 x half> %v
+}
+
+; CHECK-LABEL: extract_lane_v8f16:
+; CHECK: f16x8.extract_lane $push0=, $0, 1
+; CHECK-NEXT: return $pop0
+define float @extract_lane_v8f16(<8 x half> %v) {
+ %r = call float @llvm.wasm.extract.lane.f16x8(<8 x half> %v, i32 1)
+ ret float %r
+}
diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll
index 3143bf619065..bcb42002fb08 100644
--- a/llvm/test/CodeGen/X86/abds-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-128.ll
@@ -12,14 +12,12 @@
define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_ext_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v16i8:
@@ -47,14 +45,12 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_ext_v16i8_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v16i8_undef:
@@ -128,14 +124,12 @@ define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32:
@@ -163,14 +157,12 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_ext_v4i32_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32_undef:
@@ -198,61 +190,48 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: movq %rdi, %r8
-; SSE2-NEXT: sarq $63, %r8
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %r9, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: subq %r9, %rdx
-; SSE2-NEXT: sbbq %r10, %rsi
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sbbq %r8, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: subq %rcx, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: subq %rsi, %rdx
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64:
@@ -272,61 +251,48 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %rdx, %rsi
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: movq %rdi, %r8
-; SSE2-NEXT: sarq $63, %r8
-; SSE2-NEXT: movq %xmm1, %r9
-; SSE2-NEXT: movq %r9, %r10
-; SSE2-NEXT: sarq $63, %r10
-; SSE2-NEXT: subq %r9, %rdx
-; SSE2-NEXT: sbbq %r10, %rsi
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sbbq %r8, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: xorq %rcx, %rax
-; SSE2-NEXT: subq %rcx, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rdx
-; SSE2-NEXT: subq %rsi, %rdx
-; SSE2-NEXT: movq %rdx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64_undef:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
@@ -350,14 +316,12 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
define <16 x i8> @abd_minmax_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_minmax_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v16i8:
@@ -404,14 +368,12 @@ define <8 x i16> @abd_minmax_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_minmax_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v4i32:
@@ -445,47 +407,40 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_minmax_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v2i64:
@@ -507,14 +462,12 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <16 x i8> @abd_cmp_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: abd_cmp_v16i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubb %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: psubb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubb %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v16i8:
@@ -563,14 +516,12 @@ define <8 x i16> @abd_cmp_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: abd_cmp_v4i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psubd %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v4i32:
@@ -598,9 +549,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
@@ -609,12 +560,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
@@ -622,28 +570,26 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v2i64:
@@ -790,50 +736,52 @@ define <2 x i64> @abd_subnsw_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psubq %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
-; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: movdqa %xmm2, %xmm3
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
+; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: movdqa %xmm1, %xmm4
-; SSE42-NEXT: psubq %xmm2, %xmm4
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT: pxor %xmm2, %xmm3
+; SSE42-NEXT: psubq %xmm3, %xmm2
+; SSE42-NEXT: pcmpgtq %xmm0, %xmm1
; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm0
-; SSE42-NEXT: paddq %xmm4, %xmm0
+; SSE42-NEXT: paddq %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm4
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -844,8 +792,8 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm4
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm3, %xmm3
+; AVX2-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/abds-vector-256.ll b/llvm/test/CodeGen/X86/abds-vector-256.ll
index 78190d2cb7d8..cc63ad04c08a 100644
--- a/llvm/test/CodeGen/X86/abds-vector-256.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-256.ll
@@ -223,22 +223,22 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64:
@@ -261,22 +261,22 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v4i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64_undef:
@@ -402,22 +402,22 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v4i64:
@@ -544,22 +544,22 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v4i64:
diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index 0c33e8973c2d..78b315a3773e 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -125,12 +125,10 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32:
@@ -163,12 +161,10 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v4i32_undef:
@@ -196,27 +192,22 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %rsi
-; SSE2-NEXT: xorl %edi, %edi
-; SSE2-NEXT: subq %rsi, %rcx
-; SSE2-NEXT: movl $0, %esi
-; SSE2-NEXT: sbbq %rsi, %rsi
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: sbbq %rdi, %rdi
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: xorq %rdi, %rax
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rcx
-; SSE2-NEXT: subq %rsi, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64:
@@ -226,12 +217,10 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64:
@@ -241,9 +230,9 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64:
@@ -252,9 +241,9 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64:
@@ -274,27 +263,22 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_ext_v2i64_undef:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movq %xmm1, %rsi
-; SSE2-NEXT: xorl %edi, %edi
-; SSE2-NEXT: subq %rsi, %rcx
-; SSE2-NEXT: movl $0, %esi
-; SSE2-NEXT: sbbq %rsi, %rsi
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: sbbq %rdi, %rdi
-; SSE2-NEXT: sarq $63, %rdi
-; SSE2-NEXT: xorq %rdi, %rax
-; SSE2-NEXT: subq %rdi, %rax
-; SSE2-NEXT: sarq $63, %rsi
-; SSE2-NEXT: xorq %rsi, %rcx
-; SSE2-NEXT: subq %rsi, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_ext_v2i64_undef:
@@ -304,12 +288,10 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
@@ -319,9 +301,9 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
@@ -330,9 +312,9 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
@@ -411,12 +393,10 @@ define <4 x i32> @abd_minmax_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v4i32:
@@ -450,19 +430,14 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_minmax_v2i64:
@@ -472,12 +447,10 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_minmax_v2i64:
@@ -487,9 +460,9 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v2i64:
@@ -498,9 +471,9 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v2i64:
@@ -579,12 +552,10 @@ define <4 x i32> @abd_cmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pxor %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubd %xmm0, %xmm3
; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v4i32:
@@ -612,9 +583,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: abd_cmp_v2i64:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
@@ -623,12 +594,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: psubq %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
@@ -639,12 +607,10 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
-; SSE42-NEXT: movdqa %xmm0, %xmm3
-; SSE42-NEXT: psubq %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm1, %xmm0
+; SSE42-NEXT: pxor %xmm2, %xmm0
+; SSE42-NEXT: psubq %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm2, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64:
@@ -654,9 +620,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64:
@@ -665,9 +631,9 @@ define <2 x i64> @abd_cmp_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v2i64:
@@ -692,63 +658,59 @@ define <2 x i64> @abd_cmp_v2i64_multiuse_cmp(<2 x i64> %a, <2 x i64> %b) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: paddq %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm2, %xmm0
+; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: abd_cmp_v2i64_multiuse_cmp:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: psubq %xmm1, %xmm2
-; SSE42-NEXT: movdqa %xmm1, %xmm3
-; SSE42-NEXT: psubq %xmm0, %xmm3
-; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: pxor %xmm4, %xmm1
-; SSE42-NEXT: pxor %xmm4, %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: pxor %xmm3, %xmm1
+; SSE42-NEXT: pxor %xmm3, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE42-NEXT: paddq %xmm3, %xmm0
+; SSE42-NEXT: pxor %xmm0, %xmm2
+; SSE42-NEXT: movdqa %xmm0, %xmm1
+; SSE42-NEXT: psubq %xmm2, %xmm1
+; SSE42-NEXT: paddq %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm3
-; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm1
+; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm1
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/abdu-vector-256.ll b/llvm/test/CodeGen/X86/abdu-vector-256.ll
index 884515cfedd0..080fb779fecb 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-256.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-256.ll
@@ -227,15 +227,15 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -245,9 +245,9 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64:
@@ -274,15 +274,15 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -292,9 +292,9 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64_undef:
@@ -424,15 +424,15 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -442,9 +442,9 @@ define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v4i64:
@@ -575,15 +575,15 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -593,9 +593,9 @@ define <4 x i64> @abd_cmp_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_cmp_v4i64:
diff --git a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index bb86f307afa8..b4ba23934d54 100644
--- a/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/llvm/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -48,5 +48,5 @@ entry:
; Function Attrs: nounwind readnone
declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) #1
-attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
index 8d09497cefb1..77053e2c1bc9 100644
--- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
+++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
@@ -268,30 +268,6 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf)
ret void
}
-declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr , i32, i32);
-declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, ptr , i32, i32);
-define void @prefetch(<8 x i64> %ind, ptr %base) {
-; CHECK-LABEL: prefetch:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: kxorw %k0, %k0, %k1
-; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
-; CHECK-NEXT: movb $120, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, ptr %base, i32 4, i32 3)
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, ptr %base, i32 4, i32 2)
- call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, ptr %base, i32 2, i32 3)
- call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, ptr %base, i32 2, i32 2)
- ret void
-}
-
declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, ptr, <2 x i64>, i8, i32)
define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index acbf4387255c..df71e3c3afa5 100644
--- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -265,30 +265,6 @@ define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, p
ret void
}
-declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr , i32, i32);
-declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, ptr , i32, i32);
-define dso_local void @prefetch(<8 x i64> %ind, ptr %base) {
-; CHECK-LABEL: prefetch:
-; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: kxorw %k0, %k0, %k1
-; CHECK-NEXT: vgatherpf1qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vscatterpf0qps (%rdi,%zmm0,2) {%k1}
-; CHECK-NEXT: movb $120, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vscatterpf1qps (%rdi,%zmm0,2) {%k1}
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, ptr %base, i32 4, i32 3)
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, ptr %base, i32 4, i32 2)
- call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, ptr %base, i32 2, i32 3)
- call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, ptr %base, i32 2, i32 2)
- ret void
-}
-
define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/avx512er-intrinsics.ll b/llvm/test/CodeGen/X86/avx512er-intrinsics.ll
deleted file mode 100644
index fa4025f76b57..000000000000
--- a/llvm/test/CodeGen/X86/avx512er-intrinsics.ll
+++ /dev/null
@@ -1,306 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512er --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512er --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
-
-define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
-; CHECK-LABEL: test_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
-; CHECK-LABEL: test1_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
-; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
-; CHECK-LABEL: test2_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
-; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; CHECK-NEXT: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
- ret <16 x float> %res
-}
-
-define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
-; CHECK-LABEL: test3_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
-; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; CHECK-NEXT: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4)
- ret <16 x float> %res
-}
-
-define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
-; CHECK-LABEL: test4_rsqrt28_ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
-; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8)
- ret <16 x float> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
-; CHECK-LABEL: test_rcp28_ps_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
-; CHECK-LABEL: test_rcp28_pd_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
-define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
-; CHECK-LABEL: test_exp2_ps_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vexp2ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
-; CHECK-LABEL: test_exp2_pd_512:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vexp2pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
-define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_rsqrt28_ss:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_rcp28_ss:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
-; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_rcp28_ss_load(<4 x float> %a0, ptr %a1ptr) {
-; X86-LABEL: test_rcp28_ss_load:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vrcp28ss (%eax), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcb,0x00]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rcp28_ss_load:
-; X64: # %bb.0:
-; X64-NEXT: vrcp28ss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcb,0x07]
-; X64-NEXT: retq # encoding: [0xc3]
- %a1 = load <4 x float>, ptr %a1ptr
- %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-
-define <4 x float> @test_rsqrt28_ss_load(<4 x float> %a0, ptr %a1ptr) {
-; X86-LABEL: test_rsqrt28_ss_load:
-; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vrsqrt28ss (%eax), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcd,0x00]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_ss_load:
-; X64: # %bb.0:
-; X64-NEXT: vrsqrt28ss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcd,0x07]
-; X64-NEXT: retq # encoding: [0xc3]
- %a1 = load <4 x float>, ptr %a1ptr
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-
-define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
-; X86-LABEL: test_rsqrt28_ss_maskz:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_ss_maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
-; X64-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 %mask, i32 8) ;
- ret <4 x float> %res
-}
-
-define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) {
-; X86-LABEL: test_rsqrt28_ss_mask:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
-; X86-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_ss_mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
-; X64-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
-; X64-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask, i32 8) ;
- ret <4 x float> %res
-}
-
-define <2 x double> @test_rcp28_sd_mask_load(<2 x double> %a0, ptr %a1ptr, <2 x double> %a2, i8 %mask) {
-; X86-LABEL: test_rcp28_sd_mask_load:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrcp28sd %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xcb,0xc8]
-; X86-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rcp28_sd_mask_load:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT: vrcp28sd %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xcb,0xc8]
-; X64-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
-; X64-NEXT: retq # encoding: [0xc3]
- %a1 = load <2 x double>, ptr %a1ptr
- %res = call <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> %a2, i8 %mask, i32 4) ;
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_rsqrt28_sd_maskz_load(<2 x double> %a0, ptr %a1ptr, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_maskz_load:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0xc0]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_maskz_load:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0xc0]
-; X64-NEXT: retq # encoding: [0xc3]
- %a1 = load <2 x double>, ptr %a1ptr
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
- ret <2 x double> %res
-}
-
-define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_maskz:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_maskz:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
-; X64-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 8) ;
- ret <2 x double> %res
-}
-
-define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_mask:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
-; X86-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_mask:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
-; X64-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
-; X64-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask, i32 8) ;
- ret <2 x double> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, ptr %ptr, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_maskz_mem:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vrsqrt28sd (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x00]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_maskz_mem:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
-; X64-NEXT: retq # encoding: [0xc3]
- %mem = load double , ptr %ptr, align 8
- %mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
- ret <2 x double> %res
-}
-
-define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, ptr %ptr, i8 %mask) {
-; X86-LABEL: test_rsqrt28_sd_maskz_mem_offset:
-; X86: # %bb.0:
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
-; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vrsqrt28sd 144(%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x40,0x12]
-; X86-NEXT: retl # encoding: [0xc3]
-;
-; X64-LABEL: test_rsqrt28_sd_maskz_mem_offset:
-; X64: # %bb.0:
-; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
-; X64-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
-; X64-NEXT: retq # encoding: [0xc3]
- %ptr1 = getelementptr double, ptr %ptr, i32 18
- %mem = load double , ptr %ptr1, align 8
- %mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
- ret <2 x double> %res
-}
-
diff --git a/llvm/test/CodeGen/X86/coalescer-add-implicit-def-subreg-to-reg-regression.ll b/llvm/test/CodeGen/X86/coalescer-add-implicit-def-subreg-to-reg-regression.ll
new file mode 100644
index 000000000000..0e6cb7a3aff2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/coalescer-add-implicit-def-subreg-to-reg-regression.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+; Not from issue 76416, but a separate test case reported against the same
+; regressing commit.
+define void @other_regression(i1 %cmp.not.i.i.i) {
+; CHECK-LABEL: other_regression:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movl 0, %eax
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: sarl %cl, %eax
+; CHECK-NEXT: movl $1, %edx
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: shrl %cl, %edx
+; CHECK-NEXT: imull %eax, %edx
+; CHECK-NEXT: movslq %edx, %rsi
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: callq *%rax
+entry:
+ br label %for.cond10.preheader
+
+trap: ; preds = %for.body13
+ unreachable
+
+for.cond10.preheader: ; preds = %while.cond.i.i.i, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ 1, %while.cond.i.i.i ]
+ %i = trunc i64 %indvars.iv to i32
+ br label %for.body13
+
+for.body13: ; preds = %for.cond10.preheader
+ %i1 = load i32, ptr null, align 4
+ %shr = ashr i32 %i1, %i
+ %shr15 = ashr i32 1, %i
+ %mul16 = mul i32 %shr15, %shr
+ %conv = sext i32 %mul16 to i64
+ call void null(ptr null, i64 %conv, ptr null)
+ br i1 false, label %while.cond.i.i.i, label %trap
+
+while.cond.i.i.i: ; preds = %while.cond.i.i.i, %for.body13
+ br i1 %cmp.not.i.i.i, label %for.cond10.preheader, label %while.cond.i.i.i
+}
diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll
index 49ce2455ae8c..4ed00a9d66bd 100644
--- a/llvm/test/CodeGen/X86/combine-srem.ll
+++ b/llvm/test/CodeGen/X86/combine-srem.ll
@@ -329,7 +329,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrad $3, %xmm2
-; SSE-NEXT: psrad $1, %xmm1
+; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; SSE-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
@@ -351,7 +351,7 @@ define <4 x i32> @combine_vec_srem_by_pow2b_neg(<4 x i32> %x) {
; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpsrad $3, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/crc32-target-feature.ll b/llvm/test/CodeGen/X86/crc32-target-feature.ll
index ef4fafcae5dc..9dfe27e65351 100644
--- a/llvm/test/CodeGen/X86/crc32-target-feature.ll
+++ b/llvm/test/CodeGen/X86/crc32-target-feature.ll
@@ -25,5 +25,5 @@ define i32 @test3(i32 %a, i8 %b) nounwind #2 {
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
attributes #0 = { "target-features"="+crc32" }
-attributes #1 = { "target-features"="+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop,+crc32" }
-attributes #2 = { "target-features"="+crc32,+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop" }
+attributes #1 = { "target-features"="+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop,+crc32" }
+attributes #2 = { "target-features"="+crc32,+cx8,+fxsr,-3dnow,-3dnowa,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-mmx,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-x87,-xop" }
diff --git a/llvm/test/CodeGen/X86/fat-lto-section.ll b/llvm/test/CodeGen/X86/fat-lto-section.ll
index 30c56229a0e2..f3ca8436affb 100644
--- a/llvm/test/CodeGen/X86/fat-lto-section.ll
+++ b/llvm/test/CodeGen/X86/fat-lto-section.ll
@@ -5,6 +5,6 @@
; RUN: | FileCheck %s --check-prefix=EXCLUDE
; EXCLUDE: Name Type {{.*}} ES Flg Lk Inf Al
-; EXCLUDE: .llvm.lto PROGBITS {{.*}} 00 E 0 0 1
+; EXCLUDE: .llvm.lto LLVM_LTO {{.*}} 00 E 0 0 1
@a = global i32 1
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index dbc027495297..1209e2633c06 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -546,9 +546,8 @@ define <8 x i16> @freeze_ashr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_ashr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_ashr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: psrad $1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; X86-NEXT: psrad $2, %xmm0
+; X86-NEXT: psrad $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_vec_outofrange:
@@ -660,9 +659,8 @@ define <8 x i16> @freeze_lshr_vec(<8 x i16> %a0) nounwind {
define <4 x i32> @freeze_lshr_vec_outofrange(<4 x i32> %a0) nounwind {
; X86-LABEL: freeze_lshr_vec_outofrange:
; X86: # %bb.0:
-; X86-NEXT: psrld $1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; X86-NEXT: psrld $2, %xmm0
+; X86-NEXT: psrld $3, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_vec_outofrange:
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index c6f0662cadd6..a464d78f9af3 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,X86-SSE2
-; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,X64-AVX,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64-AVX,X64-VBMI2
declare i8 @llvm.fshl.i8(i8, i8, i8)
declare i16 @llvm.fshl.i16(i16, i16, i16)
@@ -26,13 +27,13 @@ define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
}
@@ -58,13 +59,13 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) nounwind {
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i64:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdx, %rcx
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT: shldq %cl, %rsi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdx, %rcx
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-AVX-NEXT: shldq %cl, %rsi, %rax
+; X64-AVX-NEXT: retq
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 %z)
ret i64 %f
}
@@ -116,18 +117,18 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl $4
;
-; X64-AVX2-LABEL: fshl_i128:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: testb $64, %r8b
-; X64-AVX2-NEXT: cmovneq %rdi, %rsi
-; X64-AVX2-NEXT: cmoveq %rcx, %rdx
-; X64-AVX2-NEXT: cmovneq %rcx, %rdi
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: movl %r8d, %ecx
-; X64-AVX2-NEXT: shldq %cl, %rdx, %rax
-; X64-AVX2-NEXT: shldq %cl, %rdi, %rsi
-; X64-AVX2-NEXT: movq %rsi, %rdx
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i128:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: testb $64, %r8b
+; X64-AVX-NEXT: cmovneq %rdi, %rsi
+; X64-AVX-NEXT: cmoveq %rcx, %rdx
+; X64-AVX-NEXT: cmovneq %rcx, %rdi
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: movl %r8d, %ecx
+; X64-AVX-NEXT: shldq %cl, %rdx, %rax
+; X64-AVX-NEXT: shldq %cl, %rdi, %rsi
+; X64-AVX-NEXT: movq %rsi, %rdx
+; X64-AVX-NEXT: retq
%f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
ret i128 %f
}
@@ -173,21 +174,21 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i37:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdx, %rcx
-; X64-AVX2-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
-; X64-AVX2-NEXT: andq %rdx, %rax
-; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
-; X64-AVX2-NEXT: mulq %rdx
-; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax
-; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax
-; X64-AVX2-NEXT: subl %eax, %ecx
-; X64-AVX2-NEXT: shlq $27, %rsi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT: shldq %cl, %rsi, %rdi
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i37:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdx, %rcx
+; X64-AVX-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
+; X64-AVX-NEXT: andq %rdx, %rax
+; X64-AVX-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
+; X64-AVX-NEXT: mulq %rdx
+; X64-AVX-NEXT: leal (%rdx,%rdx,8), %eax
+; X64-AVX-NEXT: leal (%rdx,%rax,4), %eax
+; X64-AVX-NEXT: subl %eax, %ecx
+; X64-AVX-NEXT: shlq $27, %rsi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-AVX-NEXT: shldq %cl, %rsi, %rdi
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: retq
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
@@ -214,11 +215,11 @@ define i32 @fshl_i32_const_shift(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: shldl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_const_shift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $9, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_const_shift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $9, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
ret i32 %f
}
@@ -233,11 +234,11 @@ define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: shldl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_const_overshift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $9, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_const_overshift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $9, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
ret i32 %f
}
@@ -254,11 +255,11 @@ define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) nounwind {
; X86-SSE2-NEXT: shrdl $23, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i64_const_overshift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: shldq $41, %rsi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i64_const_overshift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: shldq $41, %rsi, %rax
+; X64-AVX-NEXT: retq
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
ret i64 %f
}
@@ -287,13 +288,13 @@ define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
}
@@ -340,22 +341,22 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i37:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdx, %rcx
-; X64-AVX2-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
-; X64-AVX2-NEXT: andq %rdx, %rax
-; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
-; X64-AVX2-NEXT: mulq %rdx
-; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax
-; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax
-; X64-AVX2-NEXT: subl %eax, %ecx
-; X64-AVX2-NEXT: addl $27, %ecx
-; X64-AVX2-NEXT: shlq $27, %rsi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-AVX2-NEXT: shrdq %cl, %rdi, %rsi
-; X64-AVX2-NEXT: movq %rsi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i37:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdx, %rcx
+; X64-AVX-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
+; X64-AVX-NEXT: andq %rdx, %rax
+; X64-AVX-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
+; X64-AVX-NEXT: mulq %rdx
+; X64-AVX-NEXT: leal (%rdx,%rdx,8), %eax
+; X64-AVX-NEXT: leal (%rdx,%rax,4), %eax
+; X64-AVX-NEXT: subl %eax, %ecx
+; X64-AVX-NEXT: addl $27, %ecx
+; X64-AVX-NEXT: shlq $27, %rsi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-AVX-NEXT: shrdq %cl, %rdi, %rsi
+; X64-AVX-NEXT: movq %rsi, %rax
+; X64-AVX-NEXT: retq
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}
@@ -382,11 +383,11 @@ define i32 @fshl_i32_demandedbits(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_demandedbits:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $9, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_demandedbits:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $9, %esi, %eax
+; X64-AVX-NEXT: retq
%x = or i32 %a0, 2147483648
%y = or i32 %a1, 1
%res = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
@@ -401,11 +402,11 @@ define i32 @fshr_i32_demandedbits(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_demandedbits:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $23, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_demandedbits:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $23, %esi, %eax
+; X64-AVX-NEXT: retq
%x = or i32 %a0, 2147483648
%y = or i32 %a1, 1
%res = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
@@ -422,12 +423,12 @@ define i32 @fshl_i32_undef0(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef0:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 undef, i32 %a0, i32 %a1)
ret i32 %res
}
@@ -442,13 +443,13 @@ define i32 @fshl_i32_undef0_msk(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef0_msk:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: andl $7, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef0_msk:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: andl $7, %ecx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%m = and i32 %a1, 7
%res = call i32 @llvm.fshl.i32(i32 undef, i32 %a0, i32 %m)
ret i32 %res
@@ -461,15 +462,43 @@ define i32 @fshl_i32_undef0_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shrl $23, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef0_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shrl $23, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shrl $23, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 undef, i32 %a0, i32 9)
ret i32 %res
}
+define <4 x i32> @fshl_v4i32_undef0_cst(<4 x i32> %a0) nounwind {
+; X86-SSE2-LABEL: fshl_v4i32_undef0_cst:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $20, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psrld $21, %xmm2
+; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $22, %xmm1
+; X86-SSE2-NEXT: psrld $23, %xmm0
+; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X86-SSE2-NEXT: retl
+;
+; X64-AVX2-LABEL: fshl_v4i32_undef0_cst:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+;
+; X64-VBMI2-LABEL: fshl_v4i32_undef0_cst:
+; X64-VBMI2: # %bb.0:
+; X64-VBMI2-NEXT: vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-VBMI2-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> undef, <4 x i32> %a0, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret <4 x i32> %res
+}
+
define i32 @fshl_i32_undef1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-LABEL: fshl_i32_undef1:
; X86-SSE2: # %bb.0:
@@ -478,13 +507,13 @@ define i32 @fshl_i32_undef1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %eax, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %eax, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 undef, i32 %a1)
ret i32 %res
}
@@ -498,14 +527,14 @@ define i32 @fshl_i32_undef1_msk(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shll %cl, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef1_msk:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: andb $7, %cl
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shll %cl, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef1_msk:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: andb $7, %cl
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shll %cl, %eax
+; X64-AVX-NEXT: retq
%m = and i32 %a1, 7
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 undef, i32 %m)
ret i32 %res
@@ -518,15 +547,34 @@ define i32 @fshl_i32_undef1_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shll $9, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef1_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shll $9, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shll $9, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 undef, i32 9)
ret i32 %res
}
+define <4 x i32> @fshl_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
+; X86-SSE2-LABEL: fshl_v4i32_undef1_cst:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT: retl
+;
+; X64-AVX-LABEL: fshl_v4i32_undef1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret <4 x i32> %res
+}
+
define i32 @fshl_i32_undef2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-LABEL: fshl_i32_undef2:
; X86-SSE2: # %bb.0:
@@ -535,11 +583,11 @@ define i32 @fshl_i32_undef2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_undef2:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl %cl, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_undef2:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl %cl, %esi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 %a1, i32 undef)
ret i32 %res
}
@@ -552,13 +600,13 @@ define i32 @fshr_i32_undef0(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef0:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %eax, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %eax, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 undef, i32 %a0, i32 %a1)
ret i32 %res
}
@@ -572,14 +620,14 @@ define i32 @fshr_i32_undef0_msk(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrl %cl, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef0_msk:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: andb $7, %cl
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrl %cl, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef0_msk:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: andb $7, %cl
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrl %cl, %eax
+; X64-AVX-NEXT: retq
%m = and i32 %a1, 7
%res = call i32 @llvm.fshr.i32(i32 undef, i32 %a0, i32 %m)
ret i32 %res
@@ -592,15 +640,38 @@ define i32 @fshr_i32_undef0_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shrl $9, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef0_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shrl $9, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shrl $9, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 undef, i32 %a0, i32 9)
ret i32 %res
}
+define <4 x i32> @fshr_v4i32_undef0_cst(<4 x i32> %a0) nounwind {
+; X86-SSE2-LABEL: fshr_v4i32_undef0_cst:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $12, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: psrld $11, %xmm2
+; X86-SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $10, %xmm1
+; X86-SSE2-NEXT: psrld $9, %xmm0
+; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
+; X86-SSE2-NEXT: retl
+;
+; X64-AVX-LABEL: fshr_v4i32_undef0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> undef, <4 x i32> %a0, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret <4 x i32> %res
+}
+
define i32 @fshr_i32_undef1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-LABEL: fshr_i32_undef1:
; X86-SSE2: # %bb.0:
@@ -609,12 +680,12 @@ define i32 @fshr_i32_undef1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 undef, i32 %a1)
ret i32 %res
}
@@ -629,13 +700,13 @@ define i32 @fshr_i32_undef1_msk(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %eax, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef1_msk:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: andl $7, %ecx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef1_msk:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: andl $7, %ecx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%m = and i32 %a1, 7
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 undef, i32 %m)
ret i32 %res
@@ -648,15 +719,39 @@ define i32 @fshr_i32_undef1_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shll $23, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef1_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shll $23, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shll $23, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 undef, i32 9)
ret i32 %res
}
+define <4 x i32> @fshr_v4i32_undef1_cst(<4 x i32> %a0) nounwind {
+; X86-SSE2-LABEL: fshr_v4i32_undef1_cst:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-SSE2-NEXT: retl
+;
+; X64-AVX2-LABEL: fshr_v4i32_undef1_cst:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+;
+; X64-VBMI2-LABEL: fshr_v4i32_undef1_cst:
+; X64-VBMI2: # %bb.0:
+; X64-VBMI2-NEXT: vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-VBMI2-NEXT: retq
+ %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
+ ret <4 x i32> %res
+}
+
define i32 @fshr_i32_undef2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-LABEL: fshr_i32_undef2:
; X86-SSE2: # %bb.0:
@@ -665,11 +760,11 @@ define i32 @fshr_i32_undef2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_undef2:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_undef2:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 %a1, i32 undef)
ret i32 %res
}
@@ -685,13 +780,13 @@ define i32 @fshl_i32_zero0(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero0:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: xorl %eax, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: xorl %eax, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 0, i32 %a0, i32 %a1)
ret i32 %res
}
@@ -703,11 +798,11 @@ define i32 @fshl_i32_zero0_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shrl $23, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero0_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shrl $23, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shrl $23, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 0, i32 %a0, i32 9)
ret i32 %res
}
@@ -721,14 +816,14 @@ define i32 @fshl_i32_zero1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edx, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edx, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 0, i32 %a1)
ret i32 %res
}
@@ -740,11 +835,11 @@ define i32 @fshl_i32_zero1_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shll $9, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero1_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shll $9, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shll $9, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 0, i32 9)
ret i32 %res
}
@@ -758,14 +853,14 @@ define i32 @fshr_i32_zero0(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero0:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edx, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edx, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 0, i32 %a0, i32 %a1)
ret i32 %res
}
@@ -777,11 +872,11 @@ define i32 @fshr_i32_zero0_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shrl $9, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero0_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shrl $9, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero0_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shrl $9, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 0, i32 %a0, i32 9)
ret i32 %res
}
@@ -795,13 +890,13 @@ define i32 @fshr_i32_zero1(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %ecx
-; X64-AVX2-NEXT: xorl %eax, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %ecx
+; X64-AVX-NEXT: xorl %eax, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 0, i32 %a1)
ret i32 %res
}
@@ -813,11 +908,11 @@ define i32 @fshr_i32_zero1_cst(i32 %a0) nounwind {
; X86-SSE2-NEXT: shll $23, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero1_cst:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shll $23, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero1_cst:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shll $23, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 0, i32 9)
ret i32 %res
}
@@ -830,10 +925,10 @@ define i32 @fshl_i32_zero2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_zero2:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_zero2:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshl.i32(i32 %a0, i32 %a1, i32 0)
ret i32 %res
}
@@ -844,10 +939,10 @@ define i32 @fshr_i32_zero2(i32 %a0, i32 %a1) nounwind {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_zero2:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_zero2:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: retq
%res = call i32 @llvm.fshr.i32(i32 %a0, i32 %a1, i32 0)
ret i32 %res
}
@@ -862,11 +957,11 @@ define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: shrdl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_const_shift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $23, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_const_shift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $23, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
ret i32 %f
}
@@ -881,11 +976,11 @@ define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: shrdl $9, %ecx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_const_overshift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: shldl $23, %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_const_overshift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: shldl $23, %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
ret i32 %f
}
@@ -902,11 +997,11 @@ define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) nounwind {
; X86-SSE2-NEXT: shldl $23, %ecx, %edx
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i64_const_overshift:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movq %rdi, %rax
-; X64-AVX2-NEXT: shldq $23, %rsi, %rax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i64_const_overshift:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: shldq $23, %rsi, %rax
+; X64-AVX-NEXT: retq
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
ret i64 %f
}
@@ -928,10 +1023,10 @@ define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshl_i32_shift_by_bitwidth:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshl_i32_shift_by_bitwidth:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
ret i32 %f
}
@@ -942,10 +1037,10 @@ define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) nounwind {
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_i32_shift_by_bitwidth:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_i32_shift_by_bitwidth:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
ret i32 %f
}
@@ -964,10 +1059,10 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounw
; X86-SSE2-NEXT: movaps %xmm1, %xmm0
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: fshr_v4i32_shift_by_bitwidth:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovaps %xmm1, %xmm0
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: fshr_v4i32_shift_by_bitwidth:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %xmm1, %xmm0
+; X64-AVX-NEXT: retq
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
ret <4 x i32> %f
}
@@ -996,30 +1091,30 @@ define void @PR45265(i32 %0, ptr nocapture readonly %1) nounwind {
; X86-SSE2-NEXT: shldl $24, %edx, %ecx
; X86-SSE2-NEXT: xorl %eax, %ecx
; X86-SSE2-NEXT: orl %ecx, %edi
-; X86-SSE2-NEXT: jne .LBB46_1
+; X86-SSE2-NEXT: jne .LBB50_1
; X86-SSE2-NEXT: # %bb.2:
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: jmp _Z3foov # TAILCALL
-; X86-SSE2-NEXT: .LBB46_1:
+; X86-SSE2-NEXT: .LBB50_1:
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: PR45265:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movslq %edi, %rax
-; X64-AVX2-NEXT: leaq (%rax,%rax,2), %rcx
-; X64-AVX2-NEXT: movsbq 10(%rsi,%rcx,4), %rdx
-; X64-AVX2-NEXT: shlq $16, %rdx
-; X64-AVX2-NEXT: movzwl 8(%rsi,%rcx,4), %edi
-; X64-AVX2-NEXT: orq %rdx, %rdi
-; X64-AVX2-NEXT: movq (%rsi,%rcx,4), %rcx
-; X64-AVX2-NEXT: shrdq $40, %rdi, %rcx
-; X64-AVX2-NEXT: cmpq %rax, %rcx
-; X64-AVX2-NEXT: je _Z3foov # TAILCALL
-; X64-AVX2-NEXT: # %bb.1:
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: PR45265:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movslq %edi, %rax
+; X64-AVX-NEXT: leaq (%rax,%rax,2), %rcx
+; X64-AVX-NEXT: movsbq 10(%rsi,%rcx,4), %rdx
+; X64-AVX-NEXT: shlq $16, %rdx
+; X64-AVX-NEXT: movzwl 8(%rsi,%rcx,4), %edi
+; X64-AVX-NEXT: orq %rdx, %rdi
+; X64-AVX-NEXT: movq (%rsi,%rcx,4), %rcx
+; X64-AVX-NEXT: shrdq $40, %rdi, %rcx
+; X64-AVX-NEXT: cmpq %rax, %rcx
+; X64-AVX-NEXT: je _Z3foov # TAILCALL
+; X64-AVX-NEXT: # %bb.1:
+; X64-AVX-NEXT: retq
%3 = sext i32 %0 to i64
%4 = getelementptr inbounds %struct.S, ptr %1, i64 %3
%5 = bitcast ptr %4 to ptr
@@ -1052,15 +1147,15 @@ define i32 @or_shl_fshl(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_fshl:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shll %cl, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %esi, %edi
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_fshl:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shll %cl, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %esi, %edi
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s)
%or = or i32 %fun, %shy
@@ -1078,15 +1173,15 @@ define i32 @or_shl_rotl(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_rotl:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shll %cl, %edi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: roll %cl, %eax
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_rotl:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shll %cl, %edi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: roll %cl, %eax
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shx = shl i32 %x, %s
%rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s)
%or = or i32 %rot, %shx
@@ -1107,15 +1202,15 @@ define i32 @or_shl_fshl_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_fshl_commute:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shll %cl, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %esi, %edi
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_fshl_commute:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shll %cl, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %esi, %edi
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %s)
%or = or i32 %shy, %fun
@@ -1133,15 +1228,15 @@ define i32 @or_shl_rotl_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_rotl_commute:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shll %cl, %edi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: roll %cl, %eax
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_rotl_commute:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shll %cl, %edi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: roll %cl, %eax
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shx = shl i32 %x, %s
%rot = call i32 @llvm.fshl.i32(i32 %y, i32 %y, i32 %s)
%or = or i32 %shx, %rot
@@ -1162,15 +1257,15 @@ define i32 @or_lshr_fshr(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_fshr:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrl %cl, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %esi, %edi
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_fshr:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrl %cl, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %esi, %edi
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shy = lshr i32 %y, %s
%fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s)
%or = or i32 %fun, %shy
@@ -1188,15 +1283,15 @@ define i32 @or_lshr_rotr(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_rotr:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrl %cl, %edi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: rorl %cl, %eax
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_rotr:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrl %cl, %edi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: rorl %cl, %eax
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shx = lshr i32 %x, %s
%rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s)
%or = or i32 %rot, %shx
@@ -1217,15 +1312,15 @@ define i32 @or_lshr_fshr_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_fshr_commute:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrl %cl, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %esi, %edi
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_fshr_commute:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrl %cl, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %esi, %edi
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shy = lshr i32 %y, %s
%fun = call i32 @llvm.fshr.i32(i32 %y, i32 %x, i32 %s)
%or = or i32 %shy, %fun
@@ -1243,15 +1338,15 @@ define i32 @or_lshr_rotr_commute(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: orl %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_rotr_commute:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: shrl %cl, %edi
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: rorl %cl, %eax
-; X64-AVX2-NEXT: orl %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_rotr_commute:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: shrl %cl, %edi
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: rorl %cl, %eax
+; X64-AVX-NEXT: orl %edi, %eax
+; X64-AVX-NEXT: retq
%shx = lshr i32 %x, %s
%rot = call i32 @llvm.fshr.i32(i32 %y, i32 %y, i32 %s)
%or = or i32 %shx, %rot
@@ -1267,13 +1362,13 @@ define i32 @or_shl_fshl_simplify(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: shldl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_shl_fshl_simplify:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shldl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_shl_fshl_simplify:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shldl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%shy = shl i32 %y, %s
%fun = call i32 @llvm.fshl.i32(i32 %y, i32 %x, i32 %s)
%or = or i32 %fun, %shy
@@ -1289,13 +1384,13 @@ define i32 @or_lshr_fshr_simplify(i32 %x, i32 %y, i32 %s) nounwind {
; X86-SSE2-NEXT: shrdl %cl, %edx, %eax
; X86-SSE2-NEXT: retl
;
-; X64-AVX2-LABEL: or_lshr_fshr_simplify:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: movl %esi, %eax
-; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-AVX2-NEXT: shrdl %cl, %edi, %eax
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: or_lshr_fshr_simplify:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl %edx, %ecx
+; X64-AVX-NEXT: movl %esi, %eax
+; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-AVX-NEXT: shrdl %cl, %edi, %eax
+; X64-AVX-NEXT: retq
%shy = lshr i32 %y, %s
%fun = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %s)
%or = or i32 %shy, %fun
diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
index 2f5a36865d4a..f8e25028cfde 100644
--- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
+++ b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
@@ -8,17 +8,13 @@ target triple = "x86_64-unknown-linux-gnu"
define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
entry:
tail call void @llvm.prefetch(ptr inttoptr (i64 291 to ptr), i32 0, i32 0, i32 1), !dbg !9
- tail call void @llvm.x86.avx512.gatherpf.dpd.512(i8 97, <8 x i32> undef, ptr null, i32 1, i32 2), !dbg !10
ret i32 291, !dbg !11
}
; Function Attrs: inaccessiblemem_or_argmemonly nounwind
declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32) #1
-; Function Attrs: argmemonly nounwind
-declare void @llvm.x86.avx512.gatherpf.dpd.512(i8, <8 x i32>, ptr, i32, i32) #2
-
-attributes #0 = {"target-cpu"="x86-64" "target-features"="+avx512pf,+sse4.2,+ssse3"}
+attributes #0 = {"target-cpu"="x86-64" "target-features"="+sse4.2,+ssse3"}
attributes #1 = { inaccessiblemem_or_argmemonly nounwind }
attributes #2 = { argmemonly nounwind }
@@ -43,4 +39,3 @@ attributes #2 = { argmemonly nounwind }
;CHECK: # %bb.0:
;CHECK: prefetchnta 291
;CHECK-NOT: prefetchnta 42(%rax,%ymm0)
-;CHECK: vgatherpf1dpd (%rax,%ymm0) {%k1}
diff --git a/llvm/test/CodeGen/X86/issue76416.ll b/llvm/test/CodeGen/X86/issue76416.ll
new file mode 100644
index 000000000000..d0f7fe684a84
--- /dev/null
+++ b/llvm/test/CodeGen/X86/issue76416.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=x86_64-unknown-freebsd15.0 < %s | FileCheck %s
+
+%struct.anon.5.28.78.99.149.119 = type { [4 x i8] }
+
+@vga_load_state_p = external dso_local global ptr, align 8
+@vga_load_state_data = external dso_local global i8, align 1
+
+define dso_local void @vga_load_state() #0 {
+; CHECK-LABEL: vga_load_state:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: cmpl $3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: jg .LBB0_3
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: incl -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: cmpl $3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: jle .LBB0_2
+; CHECK-NEXT: .LBB0_3: # %for.end
+; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_4: # %for.cond1
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movq vga_load_state_p(%rip), %rax
+; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movzbl (%rax,%rcx), %eax
+; CHECK-NEXT: movb %al, vga_load_state_data(%rip)
+; CHECK-NEXT: leal 1(%rcx), %eax
+; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: jmp .LBB0_4
+entry:
+ %i = alloca i32, align 4
+ store i32 0, ptr %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i1 = load i32, ptr %i, align 4
+ %cmp = icmp slt i32 %i1, 4
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ call void asm sideeffect "", "{ax},~{dirflag},~{fpsr},~{flags}"(i8 0) #1
+ %i2 = load i32, ptr %i, align 4
+ %inc = add nsw i32 %i2, 1
+ store i32 %inc, ptr %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ store i32 0, ptr %i, align 4
+ br label %for.cond1
+
+for.cond1: ; preds = %for.cond1, %for.end
+ call void asm sideeffect "", "N{dx},~{dirflag},~{fpsr},~{flags}"(i32 poison) #1
+ %i3 = load ptr, ptr @vga_load_state_p, align 8
+ %regs = getelementptr inbounds %struct.anon.5.28.78.99.149.119, ptr %i3, i32 0, i32 0
+ %i4 = load i32, ptr %i, align 4
+ %idxprom = sext i32 %i4 to i64
+ %arrayidx = getelementptr inbounds [4 x i8], ptr %regs, i64 0, i64 %idxprom
+ %i5 = load i8, ptr %arrayidx, align 1
+ store i8 %i5, ptr @vga_load_state_data, align 1
+ %i6 = load i32, ptr %i, align 4
+ %inc5 = add nsw i32 %i6, 1
+ store i32 %inc5, ptr %i, align 4
+ br label %for.cond1, !llvm.loop !0
+}
+
+attributes #0 = { "tune-cpu"="generic" }
+attributes #1 = { nounwind }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index c6e8b7532505..3b5ff12fb4ec 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -31,10 +31,8 @@ define <4 x i32> @vec128_i32_signed_reg_reg(<4 x i32> %a1, <4 x i32> %a2) nounwi
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubd %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
@@ -179,25 +177,22 @@ define <4 x i32> @vec128_i32_unsigned_reg_reg(<4 x i32> %a1, <4 x i32> %a2) noun
; SSE2-LABEL: vec128_i32_unsigned_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubd %xmm3, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: retq
@@ -349,10 +344,8 @@ define <4 x i32> @vec128_i32_signed_mem_reg(ptr %a1_addr, <4 x i32> %a2) nounwin
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
@@ -511,10 +504,8 @@ define <4 x i32> @vec128_i32_signed_reg_mem(<4 x i32> %a1, ptr %a2_addr) nounwin
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubd %xmm1, %xmm4
-; SSE2-NEXT: psubd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
@@ -674,10 +665,8 @@ define <4 x i32> @vec128_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubd %xmm0, %xmm4
-; SSE2-NEXT: psubd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: psubd %xmm4, %xmm2
; SSE2-NEXT: psrld $1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm2
@@ -844,74 +833,66 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; SSE2-LABEL: vec128_i64_signed_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubq %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm1
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq $1, %xmm3
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm1, %xmm4
-; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: paddq %xmm2, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_reg_reg:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
-; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psubq %xmm1, %xmm4
-; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: psrlq $1, %xmm0
-; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1]
+; SSE41-NEXT: por %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlq $1, %xmm3
+; SSE41-NEXT: psrlq $33, %xmm2
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrlq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm1, %xmm4
+; SSE41-NEXT: pmuludq %xmm3, %xmm4
+; SSE41-NEXT: paddq %xmm2, %xmm4
; SSE41-NEXT: psllq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: paddq %xmm2, %xmm0
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
@@ -919,9 +900,9 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -938,9 +919,9 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1027,74 +1008,66 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; SSE2-LABEL: vec128_i64_unsigned_reg_reg:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psubq %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm1
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq $1, %xmm3
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm1, %xmm4
-; SSE2-NEXT: paddq %xmm3, %xmm4
+; SSE2-NEXT: pmuludq %xmm3, %xmm4
+; SSE2-NEXT: paddq %xmm2, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: paddq %xmm1, %xmm0
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_unsigned_reg_reg:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pmovsxbq {{.*#+}} xmm3 = [1,1]
-; SSE41-NEXT: por %xmm0, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: psubq %xmm1, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psubq %xmm1, %xmm4
-; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
-; SSE41-NEXT: psrlq $1, %xmm0
-; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm1
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm1 = [1,1]
+; SSE41-NEXT: por %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: psrlq $1, %xmm3
+; SSE41-NEXT: psrlq $33, %xmm2
+; SSE41-NEXT: pmuludq %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: psrlq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm0, %xmm4
-; SSE41-NEXT: paddq %xmm1, %xmm4
+; SSE41-NEXT: pmuludq %xmm3, %xmm4
+; SSE41-NEXT: paddq %xmm2, %xmm4
; SSE41-NEXT: psllq $32, %xmm4
-; SSE41-NEXT: pmuludq %xmm3, %xmm0
-; SSE41-NEXT: paddq %xmm2, %xmm0
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
@@ -1106,9 +1079,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1128,9 +1101,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX2-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1147,9 +1120,9 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1239,76 +1212,67 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubq %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlq $1, %xmm0
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm0, %xmm4
-; SSE2-NEXT: paddq %xmm3, %xmm4
-; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: psrlq $32, %xmm3
+; SSE2-NEXT: pmuludq %xmm0, %xmm3
+; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: psllq $32, %xmm3
+; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_mem_reg:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa (%rdi), %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: psubq %xmm0, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm3
-; SSE41-NEXT: psubq %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1]
+; SSE41-NEXT: por %xmm2, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
-; SSE41-NEXT: psrlq $33, %xmm1
-; SSE41-NEXT: pmuludq %xmm6, %xmm1
-; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: psrlq $33, %xmm2
+; SSE41-NEXT: pmuludq %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: psrlq $32, %xmm3
; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm1, %xmm3
+; SSE41-NEXT: paddq %xmm2, %xmm3
; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
-; SSE41-NEXT: paddq %xmm2, %xmm0
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
+; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
;
@@ -1317,9 +1281,9 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm4
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; AVX-NEXT: vpsrlq $1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $33, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
@@ -1337,9 +1301,9 @@ define <2 x i64> @vec128_i64_signed_mem_reg(ptr %a1_addr, <2 x i64> %a2) nounwin
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vpcomgtq %xmm0, %xmm1, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm4
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm0
+; XOP-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
; XOP-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
@@ -1442,15 +1406,10 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm1, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psubq %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubq %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: psrlq $33, %xmm3
@@ -1467,39 +1426,37 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
;
; SSE41-LABEL: vec128_i64_signed_reg_mem:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa (%rdi), %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: movdqa (%rdi), %xmm1
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psubq %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: psrlq $1, %xmm0
-; SSE41-NEXT: psrlq $33, %xmm2
-; SSE41-NEXT: pmuludq %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: psrlq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm0, %xmm3
-; SSE41-NEXT: paddq %xmm2, %xmm3
-; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm2, %xmm3
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm2 = [1,1]
+; SSE41-NEXT: por %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: psubq %xmm1, %xmm4
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: psubq %xmm4, %xmm3
+; SSE41-NEXT: movdqa %xmm3, %xmm1
+; SSE41-NEXT: psrlq $1, %xmm1
+; SSE41-NEXT: psrlq $33, %xmm3
+; SSE41-NEXT: pmuludq %xmm2, %xmm3
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: psrlq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm1, %xmm4
+; SSE41-NEXT: paddq %xmm3, %xmm4
+; SSE41-NEXT: psllq $32, %xmm4
+; SSE41-NEXT: pmuludq %xmm2, %xmm1
; SSE41-NEXT: paddq %xmm1, %xmm0
-; SSE41-NEXT: paddq %xmm3, %xmm0
+; SSE41-NEXT: paddq %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: vec128_i64_signed_reg_mem:
@@ -1507,9 +1464,9 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1527,9 +1484,9 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, ptr %a2_addr) nounwin
; XOP-NEXT: vmovdqa (%rdi), %xmm1
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1620,75 +1577,67 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psubq %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: psubq %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-NEXT: por %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: psubq %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrlq $1, %xmm0
-; SSE2-NEXT: psrlq $33, %xmm3
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm0, %xmm4
-; SSE2-NEXT: paddq %xmm3, %xmm4
-; SSE2-NEXT: psllq $32, %xmm4
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: psrlq $33, %xmm2
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: psrlq $32, %xmm3
+; SSE2-NEXT: pmuludq %xmm0, %xmm3
+; SSE2-NEXT: paddq %xmm2, %xmm3
+; SSE2-NEXT: psllq $32, %xmm3
+; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
-; SSE2-NEXT: paddq %xmm4, %xmm0
+; SSE2-NEXT: paddq %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: vec128_i64_signed_mem_mem:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: movdqa (%rsi), %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE41-NEXT: movdqa (%rsi), %xmm0
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = [2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: psubq %xmm0, %xmm3
+; SSE41-NEXT: pxor %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE41-NEXT: por %xmm3, %xmm6
-; SSE41-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psubq %xmm2, %xmm3
-; SSE41-NEXT: psubq %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE41-NEXT: por %xmm0, %xmm2
+; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [1,1]
+; SSE41-NEXT: por %xmm2, %xmm4
+; SSE41-NEXT: pxor %xmm2, %xmm3
+; SSE41-NEXT: psubq %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: psrlq $1, %xmm0
; SSE41-NEXT: psrlq $33, %xmm2
-; SSE41-NEXT: pmuludq %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm6, %xmm3
+; SSE41-NEXT: pmuludq %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm3
; SSE41-NEXT: psrlq $32, %xmm3
; SSE41-NEXT: pmuludq %xmm0, %xmm3
; SSE41-NEXT: paddq %xmm2, %xmm3
; SSE41-NEXT: psllq $32, %xmm3
-; SSE41-NEXT: pmuludq %xmm6, %xmm0
+; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: paddq %xmm3, %xmm0
; SSE41-NEXT: retq
@@ -1699,9 +1648,9 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX-NEXT: vmovdqa (%rsi), %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2
; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -1720,9 +1669,9 @@ define <2 x i64> @vec128_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vmovdqa (%rsi), %xmm1
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm2
; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm4
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm2
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
; XOP-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
@@ -2389,10 +2338,8 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psubb %xmm1, %xmm4
-; SSE2-NEXT: psubb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubb %xmm4, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
@@ -2852,10 +2799,8 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psubb %xmm1, %xmm4
-; SSE2-NEXT: psubb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubb %xmm4, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
@@ -3083,30 +3028,28 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind {
; SSE2-LABEL: vec128_i8_signed_reg_mem:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE2-NEXT: movdqa (%rdi), %xmm2
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psubb %xmm3, %xmm4
-; SSE2-NEXT: psubb %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psubb %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubb %xmm4, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm2, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pmullw %xmm3, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -3321,30 +3264,28 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; SSE2-LABEL: vec128_i8_signed_mem_mem:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa (%rsi), %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
+; SSE2-NEXT: movdqa (%rsi), %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtb %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: psubb %xmm3, %xmm4
-; SSE2-NEXT: psubb %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: psubb %xmm2, %xmm4
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: psubb %xmm4, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pmullw %xmm2, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pmullw %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: paddb %xmm1, %xmm0
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index cc08396ae8c7..92060aec3074 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -390,12 +390,12 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -427,9 +427,9 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
@@ -448,12 +448,12 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -561,25 +561,25 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm6
; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm6
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpsrlq $1, %xmm3, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
+; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,1]
; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4
-; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm9
; AVX1-NEXT: vpaddq %xmm1, %xmm9, %xmm1
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpmuludq %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpsrlq $33, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm7
; AVX1-NEXT: vpmuludq %xmm7, %xmm6, %xmm7
@@ -601,9 +601,9 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
@@ -622,12 +622,12 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtuq %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -732,12 +732,12 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm6
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm6
-; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
@@ -770,9 +770,9 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm4
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0
@@ -792,12 +792,12 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4
; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5
-; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm6
-; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
@@ -902,12 +902,12 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -940,9 +940,9 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
@@ -962,12 +962,12 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOP-NEXT: vpcomgtq %xmm2, %xmm3, %xmm4
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm5
-; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm6
-; XOP-NEXT: vpsubq %xmm0, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm1, %xmm1
-; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; XOP-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm5, %xmm1
+; XOP-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpsubq %xmm2, %xmm4, %xmm2
; XOP-NEXT: vpsrlq $1, %xmm2, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm1, %xmm1
@@ -1073,12 +1073,12 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5
-; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm6
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm6
-; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm6
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm7
; AVX1-NEXT: vpsrlq $33, %xmm0, %xmm0
@@ -1112,9 +1112,9 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4
-; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
@@ -1135,12 +1135,12 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vmovdqa 16(%rdi), %xmm3
; XOP-NEXT: vpcomgtq %xmm1, %xmm3, %xmm4
; XOP-NEXT: vpcomgtq %xmm0, %xmm2, %xmm5
-; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm6
-; XOP-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; XOP-NEXT: vblendvpd %xmm5, %xmm6, %xmm0, %xmm0
-; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm6
-; XOP-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; XOP-NEXT: vblendvpd %xmm4, %xmm6, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm0, %xmm2, %xmm0
+; XOP-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpsubq %xmm0, %xmm5, %xmm0
+; XOP-NEXT: vpsubq %xmm1, %xmm3, %xmm1
+; XOP-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpsubq %xmm1, %xmm4, %xmm1
; XOP-NEXT: vpsrlq $1, %xmm1, %xmm6
; XOP-NEXT: vpsrlq $1, %xmm0, %xmm7
; XOP-NEXT: vpsrlq $33, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/misched-critical-path.ll b/llvm/test/CodeGen/X86/misched-critical-path.ll
new file mode 100644
index 000000000000..2a95aaa46d4a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/misched-critical-path.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin8 -misched-print-dags -o - 2>&1 > /dev/null | FileCheck %s
+; REQUIRES: asserts
+
+@sc = common global i8 0
+@uc = common global i8 0
+@ui = common global i32 0
+
+; Regression Test for PR92368.
+;
+; CHECK: SU(8): CMP8rr %4:gr8, %3:gr8, implicit-def $eflags
+; CHECK: Predecessors:
+; CHECK-NEXT: SU(6): Data Latency=0 Reg=%4
+; CHECK-NEXT: SU(7): Out Latency=0
+; CHECK-NEXT: SU(5): Out Latency=0
+; CHECK-NEXT: SU(3): Data Latency=4 Reg=%3
+define void @misched_bug() nounwind {
+entry:
+ %v0 = load i8, ptr @sc, align 1
+ %v1 = zext i8 %v0 to i32
+ %v2 = load i8, ptr @uc, align 1
+ %v3 = zext i8 %v2 to i32
+ %v4 = trunc i32 %v3 to i8
+ %v5 = trunc i32 %v1 to i8
+ %pair74 = cmpxchg ptr @sc, i8 %v4, i8 %v5 monotonic monotonic
+ %v6 = extractvalue { i8, i1 } %pair74, 0
+ %v7 = icmp eq i8 %v6, %v4
+ %v8 = zext i1 %v7 to i8
+ %v9 = zext i8 %v8 to i32
+ store i32 %v9, ptr @ui, align 4
+ br label %return
+
+return: ; preds = %entry
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 43589dc993da..3f57a03decd0 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -197,8 +197,6 @@
; CHECK-NEXT: BreakFalseDeps
; CHECK-NEXT: X86 Indirect Branch Tracking
; CHECK-NEXT: X86 vzeroupper inserter
-; CHECK-NEXT: MachineDominator Tree Construction
-; CHECK-NEXT: Machine Natural Loop Construction
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
; CHECK-NEXT: X86 Byte/Word Instruction Fixup
; CHECK-NEXT: Lazy Machine Block Frequency Analysis
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index dcded7a877ab..1f82c4a5a2d9 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1173,13 +1173,14 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
;
; SSE41-LABEL: mul_v4i64_zero_lower:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; SSE41-NEXT: psrlq $32, %xmm2
-; SSE41-NEXT: pmuludq %xmm3, %xmm2
+; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pmuludq %xmm1, %xmm0
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
+; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_lower:
diff --git a/llvm/test/CodeGen/X86/pr59305.ll b/llvm/test/CodeGen/X86/pr59305.ll
index 4d59192fdc4d..46c9da5a5193 100644
--- a/llvm/test/CodeGen/X86/pr59305.ll
+++ b/llvm/test/CodeGen/X86/pr59305.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=i686-pc-linux < %s | FileCheck %s --check-prefix=X86
+; RUN: sed -e "s/SETROUND/ldmxcsr/g" %s | llc -mtriple=x86_64-pc-linux - | FileCheck %s --check-prefix=X64
+; RUN: sed -e "s/SETROUND/fldcw/g" %s | llc -mtriple=i686-pc-linux - | FileCheck %s --check-prefix=X86
define double @foo(double %0) #0 {
; X64-LABEL: foo:
@@ -74,6 +74,71 @@ define double @foo(double %0) #0 {
ret double %8
}
+define double @bar(double %0) #0 {
+; X64-LABEL: bar:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: #APP
+; X64-NEXT: ldmxcsr 0
+; X64-NEXT: #NO_APP
+; X64-NEXT: wait
+; X64-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
+; X64-NEXT: movapd %xmm2, %xmm3
+; X64-NEXT: divsd %xmm0, %xmm3
+; X64-NEXT: #APP
+; X64-NEXT: ldmxcsr 0
+; X64-NEXT: #NO_APP
+; X64-NEXT: wait
+; X64-NEXT: movapd %xmm2, %xmm1
+; X64-NEXT: divsd %xmm0, %xmm1
+; X64-NEXT: #APP
+; X64-NEXT: ldmxcsr 0
+; X64-NEXT: #NO_APP
+; X64-NEXT: wait
+; X64-NEXT: divsd %xmm0, %xmm2
+; X64-NEXT: movapd %xmm3, %xmm0
+; X64-NEXT: callq fma@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: bar:
+; X86: # %bb.0:
+; X86-NEXT: subl $28, %esp
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: #APP
+; X86-NEXT: fldcw 0
+; X86-NEXT: #NO_APP
+; X86-NEXT: fld1
+; X86-NEXT: fld %st(0)
+; X86-NEXT: fdiv %st(2), %st
+; X86-NEXT: #APP
+; X86-NEXT: fldcw 0
+; X86-NEXT: #NO_APP
+; X86-NEXT: fld %st(1)
+; X86-NEXT: fdiv %st(3), %st
+; X86-NEXT: #APP
+; X86-NEXT: fldcw 0
+; X86-NEXT: #NO_APP
+; X86-NEXT: fxch %st(2)
+; X86-NEXT: fdivp %st, %st(3)
+; X86-NEXT: fxch %st(2)
+; X86-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl {{[0-9]+}}(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: wait
+; X86-NEXT: calll fma
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+ call void asm sideeffect "SETROUND $0", "*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) null)
+ %2 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ call void asm sideeffect "SETROUND $0", "*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) null)
+ %3 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ call void asm sideeffect "SETROUND $0", "*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) null)
+ %4 = call double @llvm.experimental.constrained.fdiv.f64(double 1.000000e+00, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ %5 = call double @llvm.experimental.constrained.fma.f64(double %2, double %3, double %4, metadata !"round.dynamic", metadata !"fpexcept.ignore") #0
+ ret double %5
+}
+
declare i32 @fesetround(i32) #0
declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) #0
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) #0
diff --git a/llvm/test/CodeGen/X86/pr90703.ll b/llvm/test/CodeGen/X86/pr90703.ll
new file mode 100644
index 000000000000..c02342ffeec1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr90703.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+bmi | FileCheck %s
+
+define i64 @pr90730(i64 %x, i64 %y, ptr %p) {
+; CHECK-LABEL: pr90730:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movabsq $33181731808, %rax # imm = 0x7B9C90BE0
+; CHECK-NEXT: andnq %rax, %rdi, %rax
+; CHECK-NEXT: movq $0, (%rdx)
+; CHECK-NEXT: retq
+entry:
+ %ext = and i64 %y, 1
+ %xor1 = xor i64 %ext, 33181731817
+ %and1 = and i64 %xor1, %x
+ store i64 %and1, ptr %p, align 4
+ %v = load i64, ptr %p, align 4
+ %and2 = and i64 %v, 33181731808
+ %xor2 = xor i64 %and2, 33181731808
+ store i64 0, ptr %p, align 4
+ ret i64 %xor2
+}
diff --git a/llvm/test/CodeGen/X86/pr90844.ll b/llvm/test/CodeGen/X86/pr90844.ll
index 6feece7f66d8..b250c3f6f9a2 100644
--- a/llvm/test/CodeGen/X86/pr90844.ll
+++ b/llvm/test/CodeGen/X86/pr90844.ll
@@ -17,3 +17,20 @@ entry:
store <2 x i64> %5, ptr poison, align 16
ret void
}
+
+define void @foo(ptr %0) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT: vpxor 32(%rdi), %ymm0, %ymm1
+; CHECK-NEXT: vpxor (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK-NEXT: vmovdqa %ymm1, 32(%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %1 = load <32 x half>, ptr %0
+ %2 = fneg <32 x half> %1
+ store <32 x half> %2, ptr %0
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr92720.ll b/llvm/test/CodeGen/X86/pr92720.ll
new file mode 100644
index 000000000000..b2543c08328c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr92720.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; Make sure we don't crash when shrinking the shift amount before legalization.
+define i64 @pr92720(i64 %x) {
+; CHECK-LABEL: pr92720:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $8589934592, %rax # imm = 0x200000000
+; CHECK-NEXT: retq
+ %or = or i64 %x, 255
+ %sub = sub i64 0, %or
+ %shl = shl i64 1, %sub
+ %sext = shl i64 %shl, 32
+ ret i64 %sext
+}
diff --git a/llvm/test/CodeGen/X86/pr93000.ll b/llvm/test/CodeGen/X86/pr93000.ll
new file mode 100644
index 000000000000..0bd5da48847e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr93000.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64- -mcpu=x86-64-v4 | FileCheck %s
+
+define void @PR93000(ptr %a0, ptr %a1, ptr %a2, <32 x i16> %a3) {
+; CHECK-LABEL: PR93000:
+; CHECK: # %bb.0: # %Entry
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: addq $4, %rdi
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %Loop
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: kmovd %eax, %k1
+; CHECK-NEXT: knotd %k1, %k2
+; CHECK-NEXT: vpblendmw (%rsi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqu16 (%rdx), %zmm1 {%k2}
+; CHECK-NEXT: vmovdqu64 %zmm1, (%rsi)
+; CHECK-NEXT: movl (%rdi), %eax
+; CHECK-NEXT: addq $4, %rdi
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: jne .LBB0_1
+; CHECK-NEXT: # %bb.2: # %Then
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+Entry:
+ %pre = load i32, ptr %a0, align 4
+ br label %Loop
+
+Loop: ; preds = %Loop, %Entry
+ %p = phi i32 [ %limit, %Loop ], [ %pre, %Entry ]
+ %lsr.iv.pn = phi ptr [ %lsr.iv, %Loop ], [ %a0, %Entry ]
+ %lsr.iv = getelementptr i8, ptr %lsr.iv.pn, i64 4
+ %pn = xor i32 %p, -1
+ %m = bitcast i32 %p to <32 x i1>
+ %mn = bitcast i32 %pn to <32 x i1>
+ %mload0 = tail call <32 x i16> @llvm.masked.load.v32i16.p0(ptr %a1, i32 2, <32 x i1> %m, <32 x i16> %a3)
+ %mload1 = tail call <32 x i16> @llvm.masked.load.v32i16.p0(ptr %a2, i32 2, <32 x i1> %mn, <32 x i16> %mload0)
+ store <32 x i16> %mload1, ptr %a1, align 2
+ %limit = load i32, ptr %lsr.iv, align 4
+ %icmp = icmp eq i32 %limit, 0
+ br i1 %icmp, label %Then, label %Loop
+
+Then: ; preds = %Loop
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/prefetch.ll b/llvm/test/CodeGen/X86/prefetch.ll
index 404d49b63f25..c10e0526787d 100644
--- a/llvm/test/CodeGen/X86/prefetch.ll
+++ b/llvm/test/CodeGen/X86/prefetch.ll
@@ -6,9 +6,6 @@
; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=X86-PRFCHWSSE
; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=X86-PRFCHWSSE
; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=X86-SSE
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1
-; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1
-; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=X86-PREFETCHWT1
; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=X86-3DNOW
; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=X86-3DNOW
@@ -16,7 +13,6 @@
; 3dnow by itself get you just the single prefetch instruction with no hints
; sse provides prefetch0/1/2/nta
; supporting prefetchw, but not 3dnow implicitly provides prefetcht0/1/2/nta regardless of sse setting as we need something to fall back to for the non-write hint.
-; supporting prefetchwt1 implies prefetcht0/1/2/nta and prefetchw regardless of other settings. this allows levels for non-write and gives us an instruction for write+T0
; 3dnow prefetch instruction will only get used if you have no other prefetch instructions enabled
; rdar://10538297
@@ -48,19 +44,6 @@ define void @t(ptr %ptr) nounwind {
; X86-PRFCHWSSE-NEXT: prefetchw (%eax)
; X86-PRFCHWSSE-NEXT: retl
;
-; X86-PREFETCHWT1-LABEL: t:
-; X86-PREFETCHWT1: # %bb.0: # %entry
-; X86-PREFETCHWT1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-PREFETCHWT1-NEXT: prefetcht2 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetcht1 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetcht0 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchnta (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchw (%eax)
-; X86-PREFETCHWT1-NEXT: prefetchwt1 (%eax)
-; X86-PREFETCHWT1-NEXT: retl
-;
; X86-3DNOW-LABEL: t:
; X86-3DNOW: # %bb.0: # %entry
; X86-3DNOW-NEXT: movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll
index 62051d170994..f3f7f0515e30 100644
--- a/llvm/test/CodeGen/X86/shrink_vmul.ll
+++ b/llvm/test/CodeGen/X86/shrink_vmul.ll
@@ -1863,7 +1863,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT: movl c, %edx
; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: psrld $16, %xmm0
+; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; X86-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: psllq $32, %xmm0
; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
@@ -1884,7 +1884,7 @@ define void @mul_2xi16_varconst3(ptr nocapture readonly %a, i64 %index) {
; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movq c(%rip), %rax
; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: psrld $16, %xmm0
+; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; X64-SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE-NEXT: psllq $32, %xmm0
; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
diff --git a/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll b/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll
index 6e89445bead6..7b3667420ec6 100644
--- a/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll
+++ b/llvm/test/CodeGen/X86/speculative-load-hardening-gather.ll
@@ -558,28 +558,6 @@ entry:
ret <8 x i64> %v
}
-declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, ptr, i32, i32);
-
-define void @test_llvm_x86_avx512_gatherpf_qps_512(<8 x i64> %iv, ptr %b) #1 {
-; CHECK-LABEL: test_llvm_x86_avx512_gatherpf_qps_512:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsp, %rax
-; CHECK-NEXT: movq $-1, %rcx
-; CHECK-NEXT: sarq $63, %rax
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: orq %rax, %rdi
-; CHECK-NEXT: vpbroadcastq %rax, %zmm1
-; CHECK-NEXT: vporq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
-; CHECK-NEXT: shlq $47, %rax
-; CHECK-NEXT: orq %rax, %rsp
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-entry:
- call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %iv, ptr %b, i32 4, i32 3)
- ret void
-}
-
declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, ptr, <4 x i32>, i8, i32)
define <4 x float> @test_llvm_x86_avx512_gather3siv4_sf(ptr %b, <4 x i32> %iv) #2 {
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
index e4eca6b744af..ed7109c416e7 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -265,7 +265,6 @@ define i32 @stack_fold_fpclassph_mask(<32 x half> %a0, ptr %p) {
}
define i8 @stack_fold_fpclasssh(<8 x half> %a0) {
- ;CHECK-LABEl: stack_fold_fpclasssh:
; CHECK-LABEL: stack_fold_fpclasssh:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll b/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
index d32a37efcb5a..cd5edcf2ae50 100644
--- a/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
+++ b/llvm/test/CodeGen/X86/stack-frame-layout-remarks.ll
@@ -35,7 +35,7 @@ entry:
declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
; BOTH: Function: cleanup_array
-; BOTH-Next: Offset: [SP+4], Type: Protector, Align: 16, Size: 4
+; BOTH-NEXT: Offset: [SP+4], Type: Protector, Align: 16, Size: 4
; DEBUG: a @ dot.c:13
; STRIPPED-NOT: a @ dot.c:13
; BOTH: Offset: [SP-4], Type: Spill, Align: 8, Size: 4
diff --git a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
index 4c715b894fae..af57d972f224 100644
--- a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
+++ b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
@@ -23,7 +23,7 @@
br i1 %6, label %4, label %5, !llvm.loop !9
}
- attributes #0 = { nofree norecurse nosync nounwind uwtable writeonly mustprogress "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="generic" }
+ attributes #0 = { nofree norecurse nosync nounwind uwtable writeonly mustprogress "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="generic" }
!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}
diff --git a/llvm/test/DebugInfo/X86/debug-names-types.ll b/llvm/test/DebugInfo/X86/debug-names-types.ll
index ff0d4d52c1f0..81016e3874ee 100644
--- a/llvm/test/DebugInfo/X86/debug-names-types.ll
+++ b/llvm/test/DebugInfo/X86/debug-names-types.ll
@@ -48,11 +48,6 @@
; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4
; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present
; CHECK-NEXT: }
-; CHECK-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] {
-; CHECK-NEXT: Tag: DW_TAG_structure_type
-; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4
-; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present
-; CHECK-NEXT: }
; CHECK-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] {
; CHECK-NEXT: Tag: DW_TAG_subprogram
; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4
@@ -88,12 +83,6 @@
; CHECK-NEXT: DW_IDX_die_offset: 0x00000023
; CHECK-NEXT: DW_IDX_parent: <parent not indexed>
; CHECK-NEXT: }
-; CHECK-NEXT: Entry @ {{.+}} {
-; CHECK-NEXT: Abbrev: [[ABBREV1]]
-; CHECK-NEXT: Tag: DW_TAG_structure_type
-; CHECK-NEXT: DW_IDX_die_offset: 0x00000042
-; CHECK-NEXT: DW_IDX_parent: <parent not indexed>
-; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: ]
; CHECK-NEXT: Bucket 2 [
@@ -130,7 +119,7 @@
; CHECK-SPLIT: Foreign TU count: 1
; CHECK-SPLIT-NEXT: Bucket count: 4
; CHECK-SPLIT-NEXT: Name count: 4
-; CHECK-SPLIT-NEXT: Abbreviations table size: 0x2D
+; CHECK-SPLIT-NEXT: Abbreviations table size: 0x25
; CHECK-SPLIT-NEXT: Augmentation: 'LLVM0700'
; CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: Compilation Unit offsets [
@@ -151,11 +140,6 @@
; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present
; CHECK-SPLIT-NEXT: }
-; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] {
-; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type
-; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
-; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present
-; CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] {
; CHECK-SPLIT-NEXT: Tag: DW_TAG_subprogram
; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4
@@ -191,12 +175,6 @@
; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000021
; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed>
; CHECK-SPLIT-NEXT: }
-; CHECK-SPLIT-NEXT: Entry @ {{.*}} {
-; CHECK-SPLIT-NEXT: Abbrev: [[ABBREV]]
-; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type
-; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000039
-; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed>
-; CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: ]
; CHECK-SPLIT-NEXT: Bucket 2 [
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
new file mode 100644
index 000000000000..c0e370f20213
--- /dev/null
+++ b/llvm/test/Instrumentation/HWAddressSanitizer/mem-attr.ll
@@ -0,0 +1,15 @@
+; Test that HWASan removes writeonly and memory(*) attributes from instrumented functions.
+; RUN: opt -S -passes=hwasan %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64-unknown-linux-android30"
+
+; CHECK: define dso_local void @test_writeonly(ptr nocapture noundef %p) local_unnamed_addr #0
+define dso_local void @test_writeonly(ptr nocapture noundef writeonly %p) local_unnamed_addr #0 {
+entry:
+ store i32 42, ptr %p, align 4
+ ret void
+}
+
+; CHECK: attributes #0 = { sanitize_hwaddress uwtable }
+attributes #0 = { sanitize_hwaddress memory(argmem: write) uwtable }
diff --git a/llvm/test/Linker/darwin-target-variant.ll b/llvm/test/Linker/darwin-target-variant.ll
new file mode 100644
index 000000000000..7d46b2dda4a9
--- /dev/null
+++ b/llvm/test/Linker/darwin-target-variant.ll
@@ -0,0 +1,42 @@
+; RUN: rm -rf %t && split-file %s %t
+; RUN: llvm-link %t/1.ll %t/2.ll -S -o - | FileCheck %s
+; CHECK: {i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios13.1-macabi"}
+
+; RUN: llvm-link %t/1.ll %t/old.ll -S -o - | FileCheck %s -check-prefix OLD
+; OLD: {i32 4, !"darwin.target_variant.triple", !"x86_64-apple-ios14.0-macabi"}
+
+;--- 1.ll
+target triple = "x86_64-apple-macos10.15";
+!llvm.module.flags = !{!0, !1, !2};
+!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 15, i32 1 ] };
+!1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios13.1-macabi"};
+!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 13, i32 2 ] };
+
+define void @foo() {
+entry:
+ ret void
+}
+
+;--- 2.ll
+target triple = "x86_64-apple-macos10.15";
+!llvm.module.flags = !{!0, !1, !2};
+!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 15, i32 1 ] };
+!1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios14.0-macabi"};
+!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 13, i32 2 ] };
+
+define void @bar() {
+entry:
+ ret void
+}
+
+;--- old.ll
+target triple = "x86_64-apple-macos10.15";
+!llvm.module.flags = !{!0, !1, !2};
+!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 15, i32 1 ] };
+!1 = !{i32 4, !"darwin.target_variant.triple", !"x86_64-apple-ios14.0-macabi"};
+!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 13, i32 2 ] };
+
+define void @old() {
+entry:
+ ret void
+}
diff --git a/llvm/test/MC/AArch64/FP8/system-regs.s b/llvm/test/MC/AArch64/FP8/system-regs.s
index 4a396d4dff82..8959a7727b19 100644
--- a/llvm/test/MC/AArch64/FP8/system-regs.s
+++ b/llvm/test/MC/AArch64/FP8/system-regs.s
@@ -1,11 +1,9 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+fpmr < %s \
+// RUN: llvm-mc -triple=aarch64 -show-encoding < %s \
// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
-// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
-// RUN: | FileCheck %s --check-prefix=CHECK-ERROR
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+fpmr < %s \
-// RUN: | llvm-objdump -d --mattr=+fpmr - | FileCheck %s --check-prefix=CHECK-INST
-// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+fpmr < %s \
-// RUN: | llvm-objdump --mattr=-fpmr -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj < %s \
+// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN
// --------------------------------------------------------------------------//
// read
@@ -13,14 +11,13 @@
mrs x3, FPMR
// CHECK-INST: mrs x3, FPMR
// CHECK-ENCODING: [0x43,0x44,0x3b,0xd5]
-// CHECK-ERROR: expected readable system register
-// CHECK-UNKNOWN: d53b4443 mrs x3, S3_3_C4_C4_2
+// CHECK-UNKNOWN: d53b4443 mrs x3, FPMR
+
mrs x3, ID_AA64FPFR0_EL1
// CHECK-INST: mrs x3, ID_AA64FPFR0_EL1
// CHECK-ENCODING: [0xe3,0x04,0x38,0xd5]
-// CHECK-ERROR: expected readable system register
-// CHECK-UNKNOWN: d53804e3 mrs x3, S3_0_C0_C4_7
+// CHECK-UNKNOWN: d53804e3 mrs x3, ID_AA64FPFR0_EL1
// --------------------------------------------------------------------------//
// write
@@ -28,5 +25,4 @@ mrs x3, ID_AA64FPFR0_EL1
msr FPMR, x3
// CHECK-INST: msr FPMR, x3
// CHECK-ENCODING: [0x43,0x44,0x1b,0xd5]
-// CHECK-ERROR: expected writable system register or pstate
-// CHECK-UNKNOWN: d51b4443 msr S3_3_C4_C4_2, x3
+// CHECK-UNKNOWN: d51b4443 msr FPMR, x3 \ No newline at end of file
diff --git a/llvm/test/MC/AArch64/SVE/condtion-codes.s b/llvm/test/MC/AArch64/SVE/condition-codes.s
index c1d8e2ad715d..c1d8e2ad715d 100644
--- a/llvm/test/MC/AArch64/SVE/condtion-codes.s
+++ b/llvm/test/MC/AArch64/SVE/condition-codes.s
diff --git a/llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s b/llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s
index 658af848c363..96b14b9ec112 100644
--- a/llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/sqdecd-diagnostics.s
@@ -18,9 +18,9 @@ sqdecd sp
// CHECK-NEXT: sqdecd sp
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecd z0.s
+sqdecd z0.s
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
-// CHECK-NEXT: uqdecd z0.s
+// CHECK-NEXT: sqdecd z0.s
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
diff --git a/llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s b/llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s
index 2dfd49584908..862af7c9203b 100644
--- a/llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE/sqincp-diagnostics.s
@@ -3,48 +3,48 @@
// ------------------------------------------------------------------------- //
// Invalid result register
-uqdecp sp, p0
+sqincp sp, p0
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
-// CHECK-NEXT: uqdecp sp, p0
+// CHECK-NEXT: sqincp sp, p0
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp z0.b, p0
+sqincp z0.b, p0
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
-// CHECK-NEXT: uqdecp z0.b, p0
+// CHECK-NEXT: sqincp z0.b, p0
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0.b, w0
+sqincp w0, p0.b, w0
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
-// CHECK-NEXT: uqdecp x0, p0.b, w0
+// CHECK-NEXT: sqincp w0, p0.b, w0
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0.b, x1
+sqincp x0, p0.b, x1
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand
-// CHECK-NEXT: uqdecp x0, p0.b, x1
+// CHECK-NEXT: sqincp x0, p0.b, x1
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
// ------------------------------------------------------------------------- //
// Invalid predicate operand
-uqdecp x0, p0
+sqincp x0, p0
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate register
-// CHECK-NEXT: uqdecp x0, p0
+// CHECK-NEXT: sqincp x0, p0
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0/z
+sqincp x0, p0/z
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate register
-// CHECK-NEXT: uqdecp x0, p0/z
+// CHECK-NEXT: sqincp x0, p0/z
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0/m
+sqincp x0, p0/m
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate register
-// CHECK-NEXT: uqdecp x0, p0/m
+// CHECK-NEXT: sqincp x0, p0/m
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
-uqdecp x0, p0.q
+sqincp x0, p0.q
// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid predicate register
-// CHECK-NEXT: uqdecp x0, p0.q
+// CHECK-NEXT: sqincp x0, p0.q
// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:
sqincp z0.d, p0.b
diff --git a/llvm/test/MC/AMDGPU/amd_kernel_code_t.s b/llvm/test/MC/AMDGPU/amd_kernel_code_t.s
new file mode 100644
index 000000000000..052ec0bfabb8
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/amd_kernel_code_t.s
@@ -0,0 +1,171 @@
+; RUN: llvm-mc -triple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=asm < %s | FileCheck --check-prefix=ASM %s
+; RUN: llvm-mc -triple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s > %t
+; RUN: llvm-objdump -s %t | FileCheck --check-prefix=OBJDUMP %s
+
+; OBJDUMP: Contents of section .known_is_dynamic_callstack:
+; OBJDUMP: 0030 00000000 00000000 00001000 00000000
+
+; OBJDUMP: Contents of section .known_wavefront_sgpr_count:
+; OBJDUMP: 0050 00000000 01000000 00000000 00000000
+
+; OBJDUMP: Contents of section .known_workitem_vgpr_count:
+; OBJDUMP: 0050 00000000 00000100 00000000 00000000
+
+; OBJDUMP: Contents of section .known_workitem_private_segment_byte_size:
+; OBJDUMP: 0030 00000000 00000000 00000000 01000000
+
+; OBJDUMP: Contents of section .known_granulated_workitem_vgpr_count:
+; OBJDUMP: 0030 01000000 00000000 00000000 00000000
+
+; OBJDUMP: Contents of section .known_enable_sgpr_workgroup_id_x:
+; OBJDUMP: 0030 00000000 80000000 00000000 00000000
+
+; OBJDUMP: Contents of section .unknown_is_dynamic_callstack:
+; OBJDUMP: 0030 00000000 00000000 00001000 00000000
+
+; OBJDUMP: Contents of section .unknown_wavefront_sgpr_count:
+; OBJDUMP: 0050 00000000 01000000 00000000 00000000
+
+; OBJDUMP: Contents of section .unknown_workitem_vgpr_count:
+; OBJDUMP: 0050 00000000 00000100 00000000 00000000
+
+; OBJDUMP: Contents of section .unknown_workitem_private_segment_byte_size:
+; OBJDUMP: 0030 00000000 00000000 00000000 01000000
+
+; OBJDUMP: Contents of section .unknown_granulated_workitem_vgpr_count:
+; OBJDUMP: 0030 01000000 00000000 00000000 00000000
+
+; OBJDUMP: Contents of section .unknown_enable_sgpr_workgroup_id_x:
+; OBJDUMP: 0030 00000000 80000000 00000000 00000000
+
+.set known, 1
+
+; ASM-LABEL: known_is_dynamic_callstack:
+; ASM: is_dynamic_callstack = 1
+.section .known_is_dynamic_callstack
+known_is_dynamic_callstack:
+ .amd_kernel_code_t
+ is_dynamic_callstack = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_wavefront_sgpr_count:
+; ASM: wavefront_sgpr_count = 1
+.section .known_wavefront_sgpr_count
+known_wavefront_sgpr_count:
+ .amd_kernel_code_t
+ wavefront_sgpr_count = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_workitem_vgpr_count:
+; ASM: workitem_vgpr_count = 1
+.section .known_workitem_vgpr_count
+known_workitem_vgpr_count:
+ .amd_kernel_code_t
+ workitem_vgpr_count = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_workitem_private_segment_byte_size:
+; ASM: workitem_private_segment_byte_size = 1
+.section .known_workitem_private_segment_byte_size
+known_workitem_private_segment_byte_size:
+ .amd_kernel_code_t
+ workitem_private_segment_byte_size = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_granulated_workitem_vgpr_count:
+; ASM: granulated_workitem_vgpr_count = 1
+.section .known_granulated_workitem_vgpr_count
+known_granulated_workitem_vgpr_count:
+ .amd_kernel_code_t
+ granulated_workitem_vgpr_count = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: known_enable_sgpr_workgroup_id_x:
+; ASM: enable_sgpr_workgroup_id_x = 1
+.section .known_enable_sgpr_workgroup_id_x
+known_enable_sgpr_workgroup_id_x:
+ .amd_kernel_code_t
+ enable_sgpr_workgroup_id_x = known
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_is_dynamic_callstack:
+; ASM: is_dynamic_callstack = unknown
+.section .unknown_is_dynamic_callstack
+unknown_is_dynamic_callstack:
+ .amd_kernel_code_t
+ is_dynamic_callstack = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_wavefront_sgpr_count:
+; ASM: wavefront_sgpr_count = unknown
+.section .unknown_wavefront_sgpr_count
+unknown_wavefront_sgpr_count:
+ .amd_kernel_code_t
+ wavefront_sgpr_count = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_workitem_vgpr_count:
+; ASM: workitem_vgpr_count = unknown
+.section .unknown_workitem_vgpr_count
+unknown_workitem_vgpr_count:
+ .amd_kernel_code_t
+ workitem_vgpr_count = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_workitem_private_segment_byte_size:
+; ASM: workitem_private_segment_byte_size = unknown
+.section .unknown_workitem_private_segment_byte_size
+unknown_workitem_private_segment_byte_size:
+ .amd_kernel_code_t
+ workitem_private_segment_byte_size = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_granulated_workitem_vgpr_count:
+; ASM: granulated_workitem_vgpr_count = ((0&4294967232)|(unknown&63))&63
+; ASM: granulated_wavefront_sgpr_count = (((0&4294967232)|(unknown&63))>>6)&15
+; ASM: priority = (((0&4294967232)|(unknown&63))>>10)&3
+; ASM: float_mode = (((0&4294967232)|(unknown&63))>>12)&255
+; ASM: priv = (((0&4294967232)|(unknown&63))>>20)&1
+; ASM: enable_dx10_clamp = (((0&4294967232)|(unknown&63))>>21)&1
+; ASM: debug_mode = (((0&4294967232)|(unknown&63))>>22)&1
+; ASM: enable_ieee_mode = (((0&4294967232)|(unknown&63))>>23)&1
+; ASM: enable_wgp_mode = (((0&4294967232)|(unknown&63))>>29)&1
+; ASM: enable_mem_ordered = (((0&4294967232)|(unknown&63))>>30)&1
+; ASM: enable_fwd_progress = (((0&4294967232)|(unknown&63))>>31)&1
+.section .unknown_granulated_workitem_vgpr_count
+unknown_granulated_workitem_vgpr_count:
+ .amd_kernel_code_t
+ granulated_workitem_vgpr_count = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+; ASM-LABEL: unknown_enable_sgpr_workgroup_id_x:
+; ASM: enable_sgpr_private_segment_wave_byte_offset = ((0&4294967167)|((unknown&1)<<7))&1
+; ASM: user_sgpr_count = (((0&4294967167)|((unknown&1)<<7))>>1)&31
+; ASM: enable_trap_handler = (((0&4294967167)|((unknown&1)<<7))>>6)&1
+; ASM: enable_sgpr_workgroup_id_x = (((0&4294967167)|((unknown&1)<<7))>>7)&1
+; ASM: enable_sgpr_workgroup_id_y = (((0&4294967167)|((unknown&1)<<7))>>8)&1
+; ASM: enable_sgpr_workgroup_id_z = (((0&4294967167)|((unknown&1)<<7))>>9)&1
+; ASM: enable_sgpr_workgroup_info = (((0&4294967167)|((unknown&1)<<7))>>10)&1
+; ASM: enable_vgpr_workitem_id = (((0&4294967167)|((unknown&1)<<7))>>11)&3
+; ASM: enable_exception_msb = (((0&4294967167)|((unknown&1)<<7))>>13)&3
+; ASM: granulated_lds_size = (((0&4294967167)|((unknown&1)<<7))>>15)&511
+; ASM: enable_exception = (((0&4294967167)|((unknown&1)<<7))>>24)&127
+.section .unknown_enable_sgpr_workgroup_id_x
+unknown_enable_sgpr_workgroup_id_x:
+ .amd_kernel_code_t
+ enable_sgpr_workgroup_id_x = unknown
+ .end_amd_kernel_code_t
+ s_endpgm
+
+.set unknown, 1
diff --git a/llvm/test/MC/MachO/darwin-target-variant-reverse.ll b/llvm/test/MC/MachO/darwin-target-variant-reverse.ll
index 6d51cd8fffa8..fd527b204546 100644
--- a/llvm/test/MC/MachO/darwin-target-variant-reverse.ll
+++ b/llvm/test/MC/MachO/darwin-target-variant-reverse.ll
@@ -3,7 +3,7 @@
target triple = "x86_64-apple-ios13.1-macabi";
!llvm.module.flags = !{!0, !1, !2};
!0 = !{i32 2, !"SDK Version", [2 x i32] [ i32 13, i32 1 ] };
-!1 = !{i32 1, !"darwin.target_variant.triple", !"x86_64-apple-macos10.15"};
+!1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-macos10.15"};
!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 10, i32 15 ] };
define void @foo() {
diff --git a/llvm/test/MC/MachO/darwin-target-variant.ll b/llvm/test/MC/MachO/darwin-target-variant.ll
index d506ed92c9cc..78bd1e98410f 100644
--- a/llvm/test/MC/MachO/darwin-target-variant.ll
+++ b/llvm/test/MC/MachO/darwin-target-variant.ll
@@ -4,7 +4,7 @@
target triple = "x86_64-apple-macos10.15";
!llvm.module.flags = !{!0, !1, !2};
!0 = !{i32 2, !"SDK Version", [3 x i32] [ i32 10, i32 15, i32 1 ] };
-!1 = !{i32 1, !"darwin.target_variant.triple", !"x86_64-apple-ios13.1-macabi"};
+!1 = !{i32 2, !"darwin.target_variant.triple", !"x86_64-apple-ios13.1-macabi"};
!2 = !{i32 2, !"darwin.target_variant.SDK Version", [2 x i32] [ i32 13, i32 2 ] };
define void @foo() {
diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s
index a028d4025ec1..0e5eddd83e40 100644
--- a/llvm/test/MC/RISCV/attribute-arch.s
+++ b/llvm/test/MC/RISCV/attribute-arch.s
@@ -397,7 +397,7 @@
# CHECK: attribute 5, "rv32i2p1_xcvbi1p0"
.attribute arch, "rv32i_zicfilp0p4"
-# CHECK: attribute 5, "rv32i2p1_zicfilp0p4"
+# CHECK: attribute 5, "rv32i2p1_zicfilp0p4_zicsr2p0"
.attribute arch, "rv32i_zicfiss0p4"
# CHECK: .attribute 5, "rv32i2p1_zicfiss0p4_zicsr2p0_zimop1p0"
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 57fa71e74b8d..d397188a9882 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -845,4 +845,10 @@ main:
# CHECK: f32.store_f16 32 # encoding: [0xfc,0x31,0x01,0x20]
f32.store_f16 32
+ # CHECK: f16x8.splat # encoding: [0xfd,0xa0,0x02]
+ f16x8.splat
+
+ # CHECK: f16x8.extract_lane 1 # encoding: [0xfd,0xa1,0x02,0x01]
+ f16x8.extract_lane 1
+
end_function
diff --git a/llvm/test/ThinLTO/X86/import_callee_declaration.ll b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
index 43214e3cf941..246920e5db0d 100644
--- a/llvm/test/ThinLTO/X86/import_callee_declaration.ll
+++ b/llvm/test/ThinLTO/X86/import_callee_declaration.ll
@@ -15,16 +15,20 @@
; and the other one is larger. Both callees of 'small_func' are defined in lib.ll.
; - Given the import limit, in main's combined summary, the import type of 'small_func'
; and 'small_indirect_callee' will be 'definition', and the import type of
-; 'large_func' and 'large_indirect_callee' will be 'declaration'.
+; large* functions and their aliasees will be 'declaration'.
;
; The test will disassemble combined summaries and check the import type is
; correct. Right now postlink optimizer pipeline doesn't do anything (e.g.,
; import the declaration or de-serialize summary attributes yet) so there is
; nothing to test more than the summary content.
;
+; TODO: Extend this test case to test IR once postlink optimizer makes use of
+; the import type for declarations.
+;
; RUN: llvm-lto2 run \
; RUN: -debug-only=function-import \
; RUN: -import-instr-limit=7 \
+; RUN: -import-instr-evolution-factor=1.0 \
; RUN: -import-declaration \
; RUN: -thinlto-distributed-indexes \
; RUN: -r=main.bc,main,px \
@@ -32,36 +36,45 @@
; RUN: -r=main.bc,large_func, \
; RUN: -r=lib.bc,callee,pl \
; RUN: -r=lib.bc,large_indirect_callee,px \
+; RUN: -r=lib.bc,large_indirect_bar,px \
; RUN: -r=lib.bc,small_func,px \
; RUN: -r=lib.bc,large_func,px \
; RUN: -r=lib.bc,large_indirect_callee_alias,px \
-; RUN: -r=lib.bc,calleeAddrs,px -o summary main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: -r=lib.bc,large_indirect_bar_alias,px \
+; RUN: -r=lib.bc,calleeAddrs,px -r=lib.bc,calleeAddrs2,px -o summary main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
;
-; RUN: llvm-lto -thinlto-action=thinlink -import-declaration -import-instr-limit=7 -o combined.index.bc main.bc lib.bc
-; RUN: llvm-lto -thinlto-action=distributedindexes -debug-only=function-import -import-declaration -import-instr-limit=7 -thinlto-index combined.index.bc main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN: llvm-lto -thinlto-action=thinlink -import-declaration -import-instr-limit=7 -import-instr-evolution-factor=1.0 -o combined.index.bc main.bc lib.bc
+; RUN: llvm-lto -thinlto-action=distributedindexes -debug-only=function-import -import-declaration -import-instr-limit=7 -import-instr-evolution-factor=1.0 -thinlto-index combined.index.bc main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=DUMP
-; DUMP: - 2 function definitions and 3 function declarations imported from lib.bc
+; DUMP: - 2 function definitions and 4 function declarations imported from lib.bc
; First disassemble per-module summary and find out the GUID for {large_func, large_indirect_callee}.
;
; RUN: llvm-dis lib.bc -o - | FileCheck %s --check-prefix=LIB-DIS
-; LIB-DIS: [[LARGEFUNC:\^[0-9]+]] = gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
+; LIB-DIS: module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
+; LIB-DIS: gv: (name: "large_func", summaries: {{.*}}) ; guid = 2418497564662708935
+; LIB-DIS: gv: (name: "large_indirect_bar_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT_BAR:\^[0-9]+]]{{.*}}guid = 13590951773474913315
+; LIB-DIS: [[LARGEINDIRECT_BAR]] = gv: (name: "large_indirect_bar", summaries: {{.*}}) ; guid = 13770917885399536773
; LIB-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (name: "large_indirect_callee", summaries: {{.*}}) ; guid = 14343440786664691134
-; LIB-DIS: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]
+; LIB-DIS: gv: (name: "large_indirect_callee_alias", summaries: {{.*}}, aliasee: [[LARGEINDIRECT]]{{.*}}guid = 16730173943625350469
;
-; Secondly disassemble main's combined summary and test that large callees are
-; not imported as declarations yet.
+; Secondly, disassemble main's combined summary and verify that the import type
+; of these two GUIDs is 'declaration'.
;
; RUN: llvm-dis main.bc.thinlto.bc -o - | FileCheck %s --check-prefix=MAIN-DIS
;
; MAIN-DIS: [[LIBMOD:\^[0-9]+]] = module: (path: "lib.bc", hash: (0, 0, 0, 0, 0))
-; MAIN-DIS-NOT: [[LARGEFUNC:\^[0-9]+]] = gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
-; MAIN-DIS-NOT: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
-; MAIN-DIS-NOT: [[LARGEINDIRECTALIAS:\^[0-9]+]] = gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration)
+; MAIN-DIS: gv: (guid: 2418497564662708935, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; When alias is imported as a copy of the aliasee, but the aliasee is not being
+; imported by itself, the aliasee should be null.
+; MAIN-DIS: gv: (guid: 13590951773474913315, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), aliasee: null)))
+; MAIN-DIS: [[LARGEINDIRECT:\^[0-9]+]] = gv: (guid: 14343440786664691134, summaries: (function: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), insts: 8, {{.*}})))
+; MAIN-DIS: gv: (guid: 16730173943625350469, summaries: (alias: (module: [[LIBMOD]], flags: ({{.*}} importType: declaration), aliasee: [[LARGEINDIRECT]])))
; Run in-process ThinLTO and tests that
; 1. `callee` remains internalized even if the symbols of its callers
-; (large_func and large_indirect_callee) are exported as declarations and visible to main module.
+; (large_func, large_indirect_callee, large_indirect_bar) are exported as
+; declarations and visible to main module.
; 2. the debugging logs from `function-import` pass are expected.
; RUN: llvm-lto2 run \
@@ -69,20 +82,21 @@
; RUN: -save-temps \
; RUN: -thinlto-threads=1 \
; RUN: -import-instr-limit=7 \
+; RUN: -import-instr-evolution-factor=1.0 \
; RUN: -import-declaration \
; RUN: -r=main.bc,main,px \
; RUN: -r=main.bc,small_func, \
; RUN: -r=main.bc,large_func, \
; RUN: -r=lib.bc,callee,pl \
; RUN: -r=lib.bc,large_indirect_callee,px \
+; RUN: -r=lib.bc,large_indirect_bar,px \
; RUN: -r=lib.bc,small_func,px \
; RUN: -r=lib.bc,large_func,px \
; RUN: -r=lib.bc,large_indirect_callee_alias,px \
-; RUN: -r=lib.bc,calleeAddrs,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP
+; RUN: -r=lib.bc,large_indirect_bar_alias,px \
+; RUN: -r=lib.bc,calleeAddrs,px -r=lib.bc,calleeAddrs2,px -o in-process main.bc lib.bc 2>&1 | FileCheck %s --check-prefix=IMPORTDUMP
-; Test import status from debugging logs.
-; TODO: Serialize declaration bit and test declaration bits are correctly set,
-; and extend this test case to test IR once postlink optimizer makes use of
+; TODO: Extend this test case to test IR once postlink optimizer makes use of
; the import type for declarations.
; IMPORTDUMP-DAG: Not importing function 11825436545918268459 callee from lib.cc
; IMPORTDUMP-DAG: Is importing function declaration 14343440786664691134 large_indirect_callee from lib.cc
@@ -91,6 +105,8 @@
; IMPORTDUMP-DAG: Is importing function declaration 2418497564662708935 large_func from lib.cc
; IMPORTDUMP-DAG: Not importing global 7680325410415171624 calleeAddrs from lib.cc
; IMPORTDUMP-DAG: Is importing alias declaration 16730173943625350469 large_indirect_callee_alias from lib.cc
+; IMPORTDUMP-DAG: Is importing alias declaration 13590951773474913315 large_indirect_bar_alias from lib.cc
+; IMPORTDUMP-DAG: Not importing function 13770917885399536773 large_indirect_bar
; RUN: llvm-dis in-process.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT
@@ -101,6 +117,8 @@
; IMPORT-DAG: declare void @large_func
; IMPORT-NOT: large_indirect_callee
; IMPORT-NOT: large_indirect_callee_alias
+; IMPORT-NOT: large_indirect_bar
+; IMPORT-NOT: large_indirect_bar_alias
; INTERNALIZE: define internal void @callee()
@@ -124,8 +142,13 @@ source_filename = "lib.cc"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+; Both large_indirect_callee and large_indirect_callee_alias are referenced
+; and visible to main.ll.
@calleeAddrs = global [3 x ptr] [ptr @large_indirect_callee, ptr @small_indirect_callee, ptr @large_indirect_callee_alias]
+; large_indirect_bar_alias is visible to main.ll but its aliasee isn't.
+@calleeAddrs2 = global [1 x ptr] [ptr @large_indirect_bar_alias]
+
define void @callee() #1 {
ret void
}
@@ -141,12 +164,28 @@ define void @large_indirect_callee()#2 {
ret void
}
+define void @large_indirect_bar()#2 {
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ call void @callee()
+ ret void
+}
+
define internal void @small_indirect_callee() #0 {
+entry:
+ %0 = load ptr, ptr @calleeAddrs2
+ call void %0(), !prof !3
ret void
}
@large_indirect_callee_alias = alias void(), ptr @large_indirect_callee
+@large_indirect_bar_alias = alias void(), ptr @large_indirect_bar
+
define void @small_func() {
entry:
%0 = load ptr, ptr @calleeAddrs
@@ -179,3 +218,4 @@ attributes #2 = { norecurse }
!0 = !{!"VP", i32 0, i64 1, i64 14343440786664691134, i64 1}
!1 = !{!"VP", i32 0, i64 1, i64 13568239288960714650, i64 1}
!2 = !{!"VP", i32 0, i64 1, i64 16730173943625350469, i64 1}
+!3 = !{!"VP", i32 0, i64 1, i64 13590951773474913315, i64 1}
diff --git a/llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll b/llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll
index d7cfafec89fe..49c22bf590e6 100644
--- a/llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll
+++ b/llvm/test/ThinLTO/X86/memprof-tailcall-nonunique.ll
@@ -14,10 +14,11 @@
; RUN: -r=%t.o,_Z4baz1v,plx \
; RUN: -r=%t.o,_Z4baz2v,plx \
; RUN: -r=%t.o,_Z3foob,plx \
+; RUN: -r=%t.o,xyz,plx \
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -stats -debug -save-temps \
-; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS --check-prefix=DEBUG
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
@@ -31,22 +32,20 @@
; RUN: -r=%t.o,_Z4baz1v,plx \
; RUN: -r=%t.o,_Z4baz2v,plx \
; RUN: -r=%t.o,_Z3foob,plx \
+; RUN: -r=%t.o,xyz,plx \
; RUN: -r=%t.o,main,plx \
; RUN: -r=%t.o,_Znam, \
; RUN: -stats -debug \
-; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=STATS --check-prefix=DEBUG
;; Run ThinLTO backend
; RUN: opt -passes=memprof-context-disambiguation \
; RUN: -memprof-import-summary=%t.o.thinlto.bc \
; RUN: -stats %t.o -S 2>&1 | FileCheck %s --check-prefix=IR
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
+; DEBUG: Not found through unique tail call chain: 17377440600225628772 (_Z3barv) from 15822663052811949562 (main) that actually called 8716735811002003409 (xyz) (found multiple possible chains)
-; STATS: 4 memprof-context-disambiguation - Number of profiled callees found via multiple tail call chains
+; STATS: 1 memprof-context-disambiguation - Number of profiled callees found via multiple tail call chains
;; Check that all calls in the IR are to the original functions, leading to a
;; non-cold operator new call.
@@ -125,17 +124,24 @@ return: ; preds = %if.else, %if.then
}
; Function Attrs: noinline
-; IR-LABEL: @main()
-define dso_local i32 @main() local_unnamed_addr #0 {
+; IR-LABEL: @xyz()
+define dso_local i32 @xyz() local_unnamed_addr #0 {
delete.end13:
; IR: call ptr @_Z3foob(i1 true)
- %call = tail call ptr @_Z3foob(i1 true), !callsite !10
+ %call = tail call ptr @_Z3foob(i1 true)
; IR: call ptr @_Z3foob(i1 true)
- %call1 = tail call ptr @_Z3foob(i1 true), !callsite !11
+ %call1 = tail call ptr @_Z3foob(i1 true)
; IR: call ptr @_Z3foob(i1 false)
- %call2 = tail call ptr @_Z3foob(i1 false), !callsite !12
+ %call2 = tail call ptr @_Z3foob(i1 false)
; IR: call ptr @_Z3foob(i1 false)
- %call3 = tail call ptr @_Z3foob(i1 false), !callsite !13
+ %call3 = tail call ptr @_Z3foob(i1 false)
+ ret i32 0
+}
+
+define dso_local i32 @main() local_unnamed_addr #0 {
+delete.end13:
+ ; IR: call i32 @xyz()
+ %call1 = tail call i32 @xyz(), !callsite !11
ret i32 0
}
@@ -145,17 +151,10 @@ attributes #0 = { noinline }
attributes #1 = { nobuiltin allocsize(0) }
attributes #2 = { builtin allocsize(0) }
-!0 = !{!1, !3, !5, !7}
-!1 = !{!2, !"notcold"}
-!2 = !{i64 3186456655321080972, i64 6307901912192269588}
-!3 = !{!4, !"cold"}
-!4 = !{i64 3186456655321080972, i64 6792096022461663180}
+!0 = !{!5, !7}
!5 = !{!6, !"notcold"}
!6 = !{i64 3186456655321080972, i64 8632435727821051414}
!7 = !{!8, !"cold"}
!8 = !{i64 3186456655321080972, i64 -3421689549917153178}
!9 = !{i64 3186456655321080972}
-!10 = !{i64 8632435727821051414}
!11 = !{i64 -3421689549917153178}
-!12 = !{i64 6307901912192269588}
-!13 = !{i64 6792096022461663180}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
index ce8524c70af6..0acb8f8d0fcf 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -176,9 +176,9 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4(ptr addrspace(1) %ptr, i1
ret i16 %res
}
-; Preserve unknown metadata
-define i16 @test_atomicrmw_and_i16_global_agent_preserve_md(ptr addrspace(1) %ptr, i16 %value) {
-; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_preserve_md(
+; Drop unknown metadata and noundef
+define i16 @test_atomicrmw_and_i16_global_agent_drop_md(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_drop_md(
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
@@ -198,9 +198,9 @@ define i16 @test_atomicrmw_and_i16_global_agent_preserve_md(ptr addrspace(1) %pt
ret i16 %res
}
-; Preserve unknown metadata
-define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_md(ptr addrspace(1) %ptr, i16 %value) {
-; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_md(
+; Drop unknown metadata
+define i16 @test_atomicrmw_and_i16_global_agent_align4_drop_md(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_drop_md(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
@@ -211,6 +211,89 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_md(ptr addrspace
ret i16 %res
}
+; Drop noundef, preserve mmra
+define i16 @test_atomicrmw_and_i16_global_agent_preserve_mmra(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_preserve_mmra(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
+; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !mmra [[META0:![0-9]+]]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, !noundef !0, !mmra !1
+ ret i16 %res
+}
+
+; Drop noundef, preserve mmra
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_mmra(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_mmra(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !mmra [[META0]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !noundef !0, !mmra !1
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_alias_scope(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_alias_scope(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !alias.scope [[META1:![0-9]+]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !alias.scope !2
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_noalias(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_noalias(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !noalias [[META1]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !noalias !2
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa_struct(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa_struct(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !tbaa.struct [[TBAA_STRUCT4:![0-9]+]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !tbaa.struct !5
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4_preserve_tbaa(
+; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
+; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED]]
+;
+ %res = atomicrmw and ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4, !tbaa !6
+ ret i16 %res
+}
+
define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i16 %value) {
; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(
; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
@@ -223,7 +306,7 @@ define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_remote_memory(ptr add
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
-; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META8:![0-9]+]]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
; CHECK-NEXT: ret i16 [[EXTRACTED]]
@@ -236,7 +319,7 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory(
; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_remote_memory(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
-; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META8]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: ret i16 [[EXTRACTED]]
;
@@ -256,7 +339,7 @@ define i16 @test_atomicrmw_and_i16_global_agent__amdgpu_no_fine_grained_memory(p
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]]
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]]
-; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META8]]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
; CHECK-NEXT: ret i16 [[EXTRACTED]]
@@ -269,7 +352,7 @@ define i16 @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_m
; CHECK-LABEL: @test_atomicrmw_and_i16_global_agent_align4__amdgpu_no_fine_grained_memory(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[VALUE:%.*]] to i32
; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[TMP1]], -65536
-; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = atomicrmw and ptr addrspace(1) [[PTR:%.*]], i32 [[ANDOPERAND]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META8]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[TMP2]] to i16
; CHECK-NEXT: ret i16 [[EXTRACTED]]
;
@@ -1180,6 +1263,15 @@ define bfloat @test_atomicrmw_xchg_bf16_global_agent_align4(ptr addrspace(1) %pt
}
!0 = !{}
+!1 = !{!"foo", !"bar"}
+!2 = !{!3}
+!3 = distinct !{!3, !4}
+!4 = distinct !{!4}
+!5 = !{i64 0, i64 4, !1, i64 8, i64 4}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; BASE: {{.*}}
; GCN: {{.*}}
diff --git a/llvm/test/Transforms/Attributor/issue87856.ll b/llvm/test/Transforms/Attributor/issue87856.ll
new file mode 100644
index 000000000000..4da29cc4448d
--- /dev/null
+++ b/llvm/test/Transforms/Attributor/issue87856.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4
+; RUN: opt -S -passes=attributor < %s | FileCheck %s
+
+define void @null_ptr_is_valid_call_with_null() #0 {
+; CHECK-LABEL: define void @null_ptr_is_valid_call_with_null(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @store_as0(ptr nofree noundef writeonly align 4294967296 null) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @store_as0(ptr null)
+ ret void
+}
+
+define void @null_ptr_is_valid_call_with_undef() #0 {
+; CHECK-LABEL: define void @null_ptr_is_valid_call_with_undef(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void @store_as0(ptr undef) #[[ATTR4]]
+; CHECK-NEXT: ret void
+;
+ call void @store_as0(ptr undef)
+ ret void
+}
+
+define void @store_as0(ptr %0) {
+; CHECK-LABEL: define void @store_as0(
+; CHECK-SAME: ptr nocapture nofree noundef nonnull writeonly align 2 dereferenceable(2) [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: store i16 0, ptr [[TMP0]], align 2
+; CHECK-NEXT: ret void
+;
+ store i16 0, ptr %0, align 2
+ ret void
+}
+
+define void @call_store_as1() {
+; CHECK-LABEL: define void @call_store_as1(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: call void @store_as1(ptr addrspace(1) nocapture nofree noundef writeonly align 4294967296 null) #[[ATTR4]]
+; CHECK-NEXT: ret void
+;
+ call void @store_as1(ptr addrspace(1) null)
+ ret void
+}
+
+define void @store_as1(ptr addrspace(1) %arg) {
+; CHECK-LABEL: define void @store_as1(
+; CHECK-SAME: ptr addrspace(1) nocapture nofree noundef writeonly align 2 dereferenceable_or_null(2) [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: store i16 0, ptr addrspace(1) [[ARG]], align 2
+; CHECK-NEXT: ret void
+;
+ store i16 0, ptr addrspace(1) %arg, align 2
+ ret void
+}
+
+attributes #0 = { null_pointer_is_valid }
+;.
+; CHECK: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(write) }
+; CHECK: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) }
+; CHECK: attributes #[[ATTR2]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) }
+; CHECK: attributes #[[ATTR3]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR4]] = { nofree nosync nounwind willreturn memory(write) }
+;.
diff --git a/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll b/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll
index ac3e57768ae5..00dc48ef89c9 100644
--- a/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll
+++ b/llvm/test/Transforms/ConstraintElimination/sext-unsigned-predicates.ll
@@ -13,7 +13,8 @@ define void @uge_sext(i16 %x, i32 %y) {
; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_1]], [[C_2]]
; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: call void @use(i1 true)
+; CHECK-NEXT: [[T_1:%.*]] = icmp uge i32 [[X_EXT]], [[Y]]
+; CHECK-NEXT: call void @use(i1 [[T_1]])
; CHECK-NEXT: [[C_3:%.*]] = icmp uge i16 [[X]], -10
; CHECK-NEXT: call void @use(i1 [[C_3]])
; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 [[X_EXT]], -9
@@ -65,8 +66,7 @@ define void @uge_sext_known_positive(i16 %x, i32 %y) {
; CHECK-NEXT: br i1 [[AND]], label [[BB1:%.*]], label [[BB2:%.*]]
; CHECK: bb1:
; CHECK-NEXT: call void @use(i1 true)
-; CHECK-NEXT: [[T_2:%.*]] = icmp uge i16 [[X]], 10
-; CHECK-NEXT: call void @use(i1 [[T_2]])
+; CHECK-NEXT: call void @use(i1 true)
; CHECK-NEXT: [[C_3:%.*]] = icmp uge i32 [[X_EXT]], 11
; CHECK-NEXT: call void @use(i1 [[C_3]])
; CHECK-NEXT: [[C_4:%.*]] = icmp uge i32 [[X_EXT]], 11
diff --git a/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll b/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
index 2fe92628dfa3..68e48c7d2944 100644
--- a/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
+++ b/llvm/test/Transforms/ConstraintElimination/transfer-signed-facts-to-unsigned.ll
@@ -503,11 +503,9 @@ define i32 @sge_2_gep(i32 %idx, ptr %src, i32 %idx.2) {
; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[IDX]], 2
; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]])
; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IDX_2:%.*]]
-; CHECK-NEXT: [[T_1:%.*]] = icmp ult ptr [[SRC]], [[ADD_PTR]]
; CHECK-NEXT: [[C_1:%.*]] = icmp ult ptr [[SRC]], [[ADD_PTR_2]]
-; CHECK-NEXT: [[X_1:%.*]] = xor i1 [[T_1]], [[C_1]]
-; CHECK-NEXT: [[F_1:%.*]] = icmp uge ptr [[SRC]], [[ADD_PTR]]
-; CHECK-NEXT: [[X_2:%.*]] = xor i1 [[X_1]], [[F_1]]
+; CHECK-NEXT: [[X_1:%.*]] = xor i1 true, [[C_1]]
+; CHECK-NEXT: [[X_2:%.*]] = xor i1 [[X_1]], false
; CHECK-NEXT: br i1 [[X_2]], label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: then:
; CHECK-NEXT: ret i32 0
diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll
new file mode 100644
index 000000000000..ee64ce6e4482
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll
@@ -0,0 +1,59 @@
+; Tests lowerings of different versions of coro.await.suspend
+; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split),simplifycfg' -S | FileCheck %s
+
+%Awaiter = type {}
+
+define void @f() presplitcoroutine {
+entry:
+ %awaiter = alloca %Awaiter
+ %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %size = call i32 @llvm.coro.size.i32()
+ %alloc = call ptr @malloc(i32 %size)
+ %hdl = call ptr @llvm.coro.begin(token %id, ptr %alloc)
+ call void @llvm.coro.await.suspend.handle(ptr %awaiter, ptr %hdl, ptr @await_suspend_wrapper_handle)
+ %suspend.init = call i8 @llvm.coro.suspend(token none, i1 false)
+ switch i8 %suspend.init, label %ret [
+ i8 0, label %step
+ i8 1, label %cleanup
+ ]
+
+; Check the calling convention for resuming function is fastcc
+; CHECK: define {{[^@]*}} @f()
+; CHECK: entry:
+; CHECK: %[[NEXT_HDL:.+]] = call ptr @await_suspend_wrapper_handle(
+; CHECK-NEXT: %[[CONT:.+]] = call ptr @llvm.coro.subfn.addr(ptr %[[NEXT_HDL]], i8 0)
+; CHECK-NEXT: call fastcc void %[[CONT]](ptr %[[NEXT_HDL]])
+step:
+ br label %cleanup
+
+cleanup:
+ %mem = call ptr @llvm.coro.free(token %id, ptr %hdl)
+ call void @free(ptr %mem)
+ br label %ret
+
+ret:
+ call i1 @llvm.coro.end(ptr %hdl, i1 0, token none)
+ ret void
+}
+
+; Check that we haven't accidentally gone outside the body of @f
+; CHECK-LABEL: @f.resume(
+; CHECK-LABEL: @f.destroy(
+; CHECK-LABEL: @f.cleanup(
+
+declare ptr @await_suspend_wrapper_handle(ptr, ptr)
+
+declare ptr @llvm.coro.free(token, ptr)
+declare i32 @llvm.coro.size.i32()
+declare i8 @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(ptr)
+declare void @llvm.coro.destroy(ptr)
+
+declare token @llvm.coro.id(i32, ptr, ptr, ptr)
+declare i1 @llvm.coro.alloc(token)
+declare ptr @llvm.coro.begin(token, ptr)
+declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr)
+declare i1 @llvm.coro.end(ptr, i1, token)
+
+declare noalias ptr @malloc(i32)
+declare void @free(ptr)
diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-inlined.ll
index acd6a08d7c1b..ff070d9b02ac 100644
--- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-O1.ll
+++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable-inlined.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -passes='module(coro-early),cgscc(inline,coro-split<reuse-storage>)' -S | FileCheck %s
-; RUN: opt --try-experimental-debuginfo-iterators < %s -passes='module(coro-early),cgscc(inline,coro-split<reuse-storage>)' -S | FileCheck %s
+; RUN: opt < %s -passes='module(coro-early),cgscc(inline,coro-split)' -S | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators < %s -passes='module(coro-early),cgscc(inline,coro-split)' -S | FileCheck %s
; Simplified version from pr#75104.
; Make sure we do not update debug location for hoisted dbg.declare intrinsics when optimizing coro frame.
diff --git a/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll b/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll
new file mode 100644
index 000000000000..330c61360e20
--- /dev/null
+++ b/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes='cgscc(coro-split),simplifycfg,early-cse' -S | FileCheck %s
+
+declare ptr @malloc(i64)
+
+%i8.array = type { [100 x i8] }
+declare void @consume.i8.array(ptr)
+
+@testbool = external local_unnamed_addr global i8, align 1
+
+; testval does not contain an explicit lifetime end. We must assume that it may
+; live across suspension.
+define void @HasNoLifetimeEnd() presplitcoroutine {
+; CHECK-LABEL: define void @HasNoLifetimeEnd() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @HasNoLifetimeEnd.resumers)
+; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
+; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
+; CHECK-NEXT: store ptr @HasNoLifetimeEnd.resume, ptr [[VFRAME]], align 8
+; CHECK-NEXT: [[DESTROY_ADDR:%.*]] = getelementptr inbounds [[HASNOLIFETIMEEND_FRAME:%.*]], ptr [[VFRAME]], i32 0, i32 1
+; CHECK-NEXT: store ptr @HasNoLifetimeEnd.destroy, ptr [[DESTROY_ADDR]], align 8
+; CHECK-NEXT: [[INDEX_ADDR1:%.*]] = getelementptr inbounds [[HASNOLIFETIMEEND_FRAME]], ptr [[VFRAME]], i32 0, i32 2
+; CHECK-NEXT: call void @consume.i8.array(ptr [[INDEX_ADDR1]])
+; CHECK-NEXT: [[INDEX_ADDR2:%.*]] = getelementptr inbounds [[HASNOLIFETIMEEND_FRAME]], ptr [[VFRAME]], i32 0, i32 3
+; CHECK-NEXT: store i1 false, ptr [[INDEX_ADDR2]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %testval = alloca %i8.array
+ %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %alloc = call ptr @malloc(i64 16) #3
+ %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
+
+ call void @llvm.lifetime.start.p0(i64 100, ptr %testval)
+ call void @consume.i8.array(ptr %testval)
+
+ %save = call token @llvm.coro.save(ptr null)
+ %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+ switch i8 %suspend, label %exit [
+ i8 0, label %await.ready
+ i8 1, label %exit
+ ]
+await.ready:
+ br label %exit
+exit:
+ call i1 @llvm.coro.end(ptr null, i1 false, token none)
+ ret void
+}
+
+define void @LifetimeEndAfterCoroEnd() presplitcoroutine {
+; CHECK-LABEL: define void @LifetimeEndAfterCoroEnd() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @LifetimeEndAfterCoroEnd.resumers)
+; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
+; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
+; CHECK-NEXT: store ptr @LifetimeEndAfterCoroEnd.resume, ptr [[VFRAME]], align 8
+; CHECK-NEXT: [[DESTROY_ADDR:%.*]] = getelementptr inbounds [[LIFETIMEENDAFTERCOROEND_FRAME:%.*]], ptr [[VFRAME]], i32 0, i32 1
+; CHECK-NEXT: store ptr @LifetimeEndAfterCoroEnd.destroy, ptr [[DESTROY_ADDR]], align 8
+; CHECK-NEXT: [[INDEX_ADDR1:%.*]] = getelementptr inbounds [[LIFETIMEENDAFTERCOROEND_FRAME]], ptr [[VFRAME]], i32 0, i32 2
+; CHECK-NEXT: call void @consume.i8.array(ptr [[INDEX_ADDR1]])
+; CHECK-NEXT: [[INDEX_ADDR2:%.*]] = getelementptr inbounds [[LIFETIMEENDAFTERCOROEND_FRAME]], ptr [[VFRAME]], i32 0, i32 3
+; CHECK-NEXT: store i1 false, ptr [[INDEX_ADDR2]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %testval = alloca %i8.array
+ %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %alloc = call ptr @malloc(i64 16) #3
+ %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
+
+ call void @llvm.lifetime.start.p0(i64 100, ptr %testval)
+ call void @consume.i8.array(ptr %testval)
+
+ %save = call token @llvm.coro.save(ptr null)
+ %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+ switch i8 %suspend, label %exit [
+ i8 0, label %await.ready
+ i8 1, label %exit
+ ]
+await.ready:
+ br label %exit
+exit:
+ call i1 @llvm.coro.end(ptr null, i1 false, token none)
+ call void @llvm.lifetime.end.p0(i64 100, ptr %testval)
+ ret void
+}
+
+define void @BranchWithoutLifetimeEnd() presplitcoroutine {
+; CHECK-LABEL: define void @BranchWithoutLifetimeEnd() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ID:%.*]] = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr @BranchWithoutLifetimeEnd.resumers)
+; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 16)
+; CHECK-NEXT: [[VFRAME:%.*]] = call noalias nonnull ptr @llvm.coro.begin(token [[ID]], ptr [[ALLOC]])
+; CHECK-NEXT: store ptr @BranchWithoutLifetimeEnd.resume, ptr [[VFRAME]], align 8
+; CHECK-NEXT: [[DESTROY_ADDR:%.*]] = getelementptr inbounds [[BRANCHWITHOUTLIFETIMEEND_FRAME:%.*]], ptr [[VFRAME]], i32 0, i32 1
+; CHECK-NEXT: store ptr @BranchWithoutLifetimeEnd.destroy, ptr [[DESTROY_ADDR]], align 8
+; CHECK-NEXT: [[TESTVAL:%.*]] = getelementptr inbounds [[BRANCHWITHOUTLIFETIMEEND_FRAME]], ptr [[VFRAME]], i32 0, i32 2
+; CHECK-NEXT: call void @consume.i8.array(ptr [[TESTVAL]])
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @testbool, align 1
+; CHECK-NEXT: [[INDEX_ADDR1:%.*]] = getelementptr inbounds [[BRANCHWITHOUTLIFETIMEEND_FRAME]], ptr [[VFRAME]], i32 0, i32 3
+; CHECK-NEXT: store i1 false, ptr [[INDEX_ADDR1]], align 1
+; CHECK-NEXT: ret void
+;
+entry:
+ %testval = alloca %i8.array
+ %id = call token @llvm.coro.id(i32 0, ptr null, ptr null, ptr null)
+ %alloc = call ptr @malloc(i64 16) #3
+ %vFrame = call noalias nonnull ptr @llvm.coro.begin(token %id, ptr %alloc)
+
+ call void @llvm.lifetime.start.p0(i64 100, ptr %testval)
+ call void @consume.i8.array(ptr %testval)
+
+ %0 = load i8, ptr @testbool, align 1
+ %tobool = trunc nuw i8 %0 to i1
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+ call void @llvm.lifetime.end.p0(i64 100, ptr %testval)
+ br label %if.end
+
+if.end:
+ %save = call token @llvm.coro.save(ptr null)
+ %suspend = call i8 @llvm.coro.suspend(token %save, i1 false)
+ switch i8 %suspend, label %exit [
+ i8 0, label %await.ready
+ i8 1, label %exit
+ ]
+await.ready:
+ br label %exit
+exit:
+ call i1 @llvm.coro.end(ptr null, i1 false, token none)
+ ret void
+}
+
+
+declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr)
+declare ptr @llvm.coro.begin(token, ptr writeonly) #3
+declare ptr @llvm.coro.frame() #5
+declare i8 @llvm.coro.suspend(token, i1) #3
+declare i1 @llvm.coro.end(ptr, i1, token) #3
+declare void @llvm.lifetime.start.p0(i64, ptr nocapture) #4
+declare void @llvm.lifetime.end.p0(i64, ptr nocapture) #4
diff --git a/llvm/test/Transforms/Coroutines/no-suspend.ll b/llvm/test/Transforms/Coroutines/no-suspend.ll
index 53eb98f1273a..fd8c5ac99095 100644
--- a/llvm/test/Transforms/Coroutines/no-suspend.ll
+++ b/llvm/test/Transforms/Coroutines/no-suspend.ll
@@ -325,7 +325,7 @@ body:
%save = call token @llvm.coro.save(ptr %hdl)
%subfn = call ptr @llvm.coro.subfn.addr(ptr %hdl, i8 1)
call fastcc void %subfn(ptr %hdl)
- ; memcpy separates destory from suspend, therefore cannot simplify.
+ ; memcpy separates destroy from suspend, therefore cannot simplify.
call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 1, i1 false)
%0 = call i8 @llvm.coro.suspend(token %save, i1 false)
switch i8 %0, label %suspend [i8 0, label %resume
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll b/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
index b28107ef9d18..086043d4b7c1 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/mul.ll
@@ -179,8 +179,7 @@ define i1 @nuw_range1(i8 %b) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[C:%.*]] = add nuw nsw i8 [[B:%.*]], 1
; CHECK-NEXT: [[MUL:%.*]] = mul nuw i8 [[C]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[MUL]], 0
-; CHECK-NEXT: ret i1 [[CMP]]
+; CHECK-NEXT: ret i1 false
;
entry:
%c = add nuw nsw i8 %b, 1
@@ -194,8 +193,7 @@ define i1 @nuw_range2(i8 %b) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[C:%.*]] = add nuw nsw i8 [[B:%.*]], 3
; CHECK-NEXT: [[MUL:%.*]] = mul nuw i8 [[C]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[MUL]], 2
-; CHECK-NEXT: ret i1 [[CMP]]
+; CHECK-NEXT: ret i1 false
;
entry:
%c = add nuw nsw i8 %b, 3
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount-aix.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount-aix.ll
new file mode 100644
index 000000000000..82551f012d0b
--- /dev/null
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount-aix.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="function(ee-instrument),cgscc(inline),function(ee-instrument<post-inline>)" -S < %s | FileCheck %s
+
+target triple = "powerpc-ibm-aix7.2.0.0"
+
+define void @f1() "instrument-function-entry-inlined"="__mcount" {
+; CHECK-LABEL: define void @f1() {
+; CHECK-NEXT: call void @__mcount(ptr @[[GLOB0:[0-9]+]])
+; CHECK-NEXT: ret void
+;
+ ret void
+}
diff --git a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
index c444b060d613..bd5f4c2b51a8 100644
--- a/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
+++ b/llvm/test/Transforms/EntryExitInstrumenter/mcount.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes="function(ee-instrument),cgscc(inline),function(ee-instrument<post-inline>)" -S < %s | FileCheck %s
; Running the passes twice should not result in more instrumentation.
@@ -7,104 +8,126 @@ target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux"
define void @leaf_function() #0 {
-entry:
+; CHECK-LABEL: define void @leaf_function() {
+; CHECK-NEXT: call void @mcount()
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @leaf_function, ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @leaf_function, ptr [[TMP2]])
+; CHECK-NEXT: ret void
+;
ret void
-
-; CHECK-LABEL: define void @leaf_function()
-; CHECK: entry:
-; CHECK-NEXT: call void @mcount()
-; CHECK-NEXT: %0 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @leaf_function, ptr %0)
-; CHECK-NEXT: %1 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @leaf_function, ptr %1)
-; CHECK-NEXT: ret void
}
define void @root_function() #0 {
-entry:
+; CHECK-LABEL: define void @root_function() {
+; CHECK-NEXT: call void @mcount()
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @root_function, ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @leaf_function, ptr [[TMP2]])
+; CHECK-NEXT: [[TMP3:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @leaf_function, ptr [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @root_function, ptr [[TMP4]])
+; CHECK-NEXT: ret void
+;
call void @leaf_function()
ret void
-
-; CHECK-LABEL: define void @root_function()
-; CHECK: entry:
-; CHECK-NEXT: call void @mcount()
-
-; CHECK-NEXT: %0 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @root_function, ptr %0)
-
-; Entry and exit calls, inlined from @leaf_function()
-; CHECK-NEXT: %1 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_enter(ptr @leaf_function, ptr %1)
-; CHECK-NEXT: %2 = call ptr @llvm.returnaddress(i32 0)
-; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @leaf_function, ptr %2)
-; CHECK-NEXT: %3 = call ptr @llvm.returnaddress(i32 0)
-
-; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @root_function, ptr %3)
-; CHECK-NEXT: ret void
}
-
-
; The mcount function has many different names.
-define void @f1() #1 { entry: ret void }
-; CHECK-LABEL: define void @f1
-; CHECK: call void @.mcount
-
-define void @f2() #2 { entry: ret void }
-; CHECK-LABEL: define void @f2
-; CHECK: call void @llvm.arm.gnu.eabi.mcount
+define void @f1() #1 {
+; CHECK-LABEL: define void @f1() {
+; CHECK-NEXT: call void @.mcount()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f3() #3 { entry: ret void }
-; CHECK-LABEL: define void @f3
-; CHECK: call void @"\01_mcount"
+define void @f2() #2 {
+; CHECK-LABEL: define void @f2() {
+; CHECK-NEXT: call void @llvm.arm.gnu.eabi.mcount()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f4() #4 { entry: ret void }
-; CHECK-LABEL: define void @f4
-; CHECK: call void @"\01mcount"
+define void @f3() #3 {
+; CHECK-LABEL: define void @f3() {
+; CHECK-NEXT: call void @"\01_mcount"()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f5() #5 { entry: ret void }
-; CHECK-LABEL: define void @f5
-; CHECK: call void @__mcount
+define void @f4() #4 {
+; CHECK-LABEL: define void @f4() {
+; CHECK-NEXT: call void @"\01mcount"()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f6() #6 { entry: ret void }
-; CHECK-LABEL: define void @f6
-; CHECK: call void @_mcount
+define void @f5() #5 {
+; CHECK-LABEL: define void @f5() {
+; CHECK-NEXT: call void @__mcount()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
-define void @f7() #7 { entry: ret void }
-; CHECK-LABEL: define void @f7
-; CHECK: call void @__cyg_profile_func_enter_bare
+define void @f6() #6 {
+; CHECK-LABEL: define void @f6() {
+; CHECK-NEXT: call void @_mcount()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+define void @f7() #7 {
+; CHECK-LABEL: define void @f7() {
+; CHECK-NEXT: call void @__cyg_profile_func_enter_bare()
+; CHECK-NEXT: ret void
+;
+ ret void
+}
; Treat musttail calls as terminators; inserting between the musttail call and
; ret is not allowed.
declare ptr @tailcallee()
define ptr @tailcaller() #8 {
+; CHECK-LABEL: define ptr @tailcaller() {
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @tailcaller, ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = musttail call ptr @tailcallee()
+; CHECK-NEXT: ret ptr [[TMP2]]
+;
%1 = musttail call ptr @tailcallee()
ret ptr %1
-; CHECK-LABEL: define ptr @tailcaller
-; CHECK: call void @__cyg_profile_func_exit
-; CHECK: musttail call ptr @tailcallee
-; CHECK: ret
}
define ptr @tailcaller2() #8 {
+; CHECK-LABEL: define ptr @tailcaller2() {
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.returnaddress(i32 0)
+; CHECK-NEXT: call void @__cyg_profile_func_exit(ptr @tailcaller2, ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = musttail call ptr @tailcallee()
+; CHECK-NEXT: ret ptr [[TMP2]]
+;
%1 = musttail call ptr @tailcallee()
- %2 = bitcast ptr %1 to ptr
- ret ptr %2
-; CHECK-LABEL: define ptr @tailcaller2
-; CHECK: call void @__cyg_profile_func_exit
-; CHECK: musttail call ptr @tailcallee
-; CHECK: bitcast
-; CHECK: ret
+ ret ptr %1
}
;; naked functions are not instrumented, otherwise the argument registers
;; and the return address register (if present) would be clobbered.
-define void @naked() naked { entry: ret void }
-; CHECK-LABEL: define void @naked(
-; CHECK-LABEL-NEXT: entry:
-; CHECK-LABEL-NEXT: ret void
+define void @naked() naked {
+; CHECK-LABEL: define void @naked(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
; The attributes are "consumed" when the instrumentation is inserted.
; CHECK: attributes
diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll
index 8d6f6a7c73f8..7df6132ac6a3 100644
--- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll
@@ -163,24 +163,24 @@ define i1 @c6(ptr %q, i8 %bit) personality ptr @__gxx_personality_v0 {
; FNATTRS-LABEL: define noundef i1 @c6
; FNATTRS-SAME: (ptr readonly [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR5:[0-9]+]] personality ptr @__gxx_personality_v0 {
; FNATTRS-NEXT: invoke void @throw_if_bit_set(ptr [[Q]], i8 [[BIT]])
-; FNATTRS-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
+; FNATTRS-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
; FNATTRS: ret0:
; FNATTRS-NEXT: ret i1 false
; FNATTRS: ret1:
; FNATTRS-NEXT: [[EXN:%.*]] = landingpad { ptr, i32 }
-; FNATTRS-NEXT: cleanup
+; FNATTRS-NEXT: cleanup
; FNATTRS-NEXT: ret i1 true
;
; ATTRIBUTOR: Function Attrs: nosync memory(read)
; ATTRIBUTOR-LABEL: define i1 @c6
; ATTRIBUTOR-SAME: (ptr readonly [[Q:%.*]], i8 [[BIT:%.*]]) #[[ATTR4:[0-9]+]] personality ptr @__gxx_personality_v0 {
; ATTRIBUTOR-NEXT: invoke void @throw_if_bit_set(ptr [[Q]], i8 [[BIT]]) #[[ATTR4]]
-; ATTRIBUTOR-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
+; ATTRIBUTOR-NEXT: to label [[RET0:%.*]] unwind label [[RET1:%.*]]
; ATTRIBUTOR: ret0:
; ATTRIBUTOR-NEXT: ret i1 false
; ATTRIBUTOR: ret1:
; ATTRIBUTOR-NEXT: [[EXN:%.*]] = landingpad { ptr, i32 }
-; ATTRIBUTOR-NEXT: cleanup
+; ATTRIBUTOR-NEXT: cleanup
; ATTRIBUTOR-NEXT: ret i1 true
;
invoke void @throw_if_bit_set(ptr %q, i8 %bit)
diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
index ec5545b969e5..4432c4f3c541 100644
--- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll
+++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll
@@ -246,7 +246,7 @@ define ptr @test10(ptr %a, i64 %n) {
; ATTRIBUTOR-LABEL: define ptr @test10(
; ATTRIBUTOR-SAME: ptr nofree readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] {
; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0
-; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 [[CMP]]) #[[ATTR14:[0-9]+]]
+; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 [[CMP]]) #[[ATTR13:[0-9]+]]
; ATTRIBUTOR-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]]
; ATTRIBUTOR-NEXT: ret ptr [[B]]
;
@@ -338,7 +338,7 @@ define internal void @test13(ptr %a, ptr %b, ptr %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @test13(
-; ATTRIBUTOR-SAME: ptr nocapture nofree readnone [[A:%.*]], ptr nocapture nofree readnone [[B:%.*]], ptr nocapture nofree readnone [[C:%.*]]) #[[ATTR4:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nocapture nofree nonnull readnone [[A:%.*]], ptr nocapture nofree readnone [[B:%.*]], ptr nocapture nofree readnone [[C:%.*]]) #[[ATTR0]] {
; ATTRIBUTOR-NEXT: ret void
;
ret void
@@ -382,7 +382,7 @@ define internal ptr @f1(ptr %arg) {
; FNATTRS-NEXT: ret ptr [[TMP10]]
;
; ATTRIBUTOR-LABEL: define internal ptr @f1(
-; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR5:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] {
; ATTRIBUTOR-NEXT: bb:
; ATTRIBUTOR-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null
; ATTRIBUTOR-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]]
@@ -392,11 +392,11 @@ define internal ptr @f1(ptr %arg) {
; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[BB6:%.*]], label [[BB4:%.*]]
; ATTRIBUTOR: bb4:
; ATTRIBUTOR-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 1
-; ATTRIBUTOR-NEXT: [[TMP5B:%.*]] = tail call ptr @f3(ptr readonly [[TMP5]]) #[[ATTR15:[0-9]+]]
+; ATTRIBUTOR-NEXT: [[TMP5B:%.*]] = tail call ptr @f3(ptr nofree nonnull readonly [[TMP5]]) #[[ATTR14:[0-9]+]]
; ATTRIBUTOR-NEXT: [[TMP5C:%.*]] = getelementptr inbounds i32, ptr [[TMP5B]], i64 -1
; ATTRIBUTOR-NEXT: br label [[BB9]]
; ATTRIBUTOR: bb6:
-; ATTRIBUTOR-NEXT: [[TMP7:%.*]] = tail call ptr @f2(ptr readonly [[ARG]]) #[[ATTR15]]
+; ATTRIBUTOR-NEXT: [[TMP7:%.*]] = tail call ptr @f2(ptr nofree nonnull readonly [[ARG]]) #[[ATTR14]]
; ATTRIBUTOR-NEXT: ret ptr [[TMP7]]
; ATTRIBUTOR: bb9:
; ATTRIBUTOR-NEXT: [[TMP10:%.*]] = phi ptr [ [[TMP5C]], [[BB4]] ], [ inttoptr (i64 4 to ptr), [[BB:%.*]] ]
@@ -436,9 +436,9 @@ define internal ptr @f2(ptr %arg) {
; FNATTRS-NEXT: ret ptr [[TMP]]
;
; ATTRIBUTOR-LABEL: define internal ptr @f2(
-; ATTRIBUTOR-SAME: ptr readonly [[ARG:%.*]]) #[[ATTR5]] {
+; ATTRIBUTOR-SAME: ptr nofree nonnull readonly [[ARG:%.*]]) #[[ATTR4]] {
; ATTRIBUTOR-NEXT: bb:
-; ATTRIBUTOR-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr readonly [[ARG]]) #[[ATTR15]]
+; ATTRIBUTOR-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr nofree nonnull readonly [[ARG]]) #[[ATTR14]]
; ATTRIBUTOR-NEXT: ret ptr [[TMP]]
;
bb:
@@ -457,9 +457,9 @@ define dso_local noalias ptr @f3(ptr %arg) {
; FNATTRS-NEXT: ret ptr [[TMP]]
;
; ATTRIBUTOR-LABEL: define dso_local noalias ptr @f3(
-; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR5]] {
+; ATTRIBUTOR-SAME: ptr nofree readonly [[ARG:%.*]]) #[[ATTR4]] {
; ATTRIBUTOR-NEXT: bb:
-; ATTRIBUTOR-NEXT: [[TMP:%.*]] = call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR15]]
+; ATTRIBUTOR-NEXT: [[TMP:%.*]] = call ptr @f1(ptr nofree readonly [[ARG]]) #[[ATTR14]]
; ATTRIBUTOR-NEXT: ret ptr [[TMP]]
;
bb:
@@ -508,14 +508,14 @@ define void @f16(ptr %a, ptr %b, i8 %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define void @f16(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0
; ATTRIBUTOR-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
; ATTRIBUTOR: if.then:
-; ATTRIBUTOR-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR16:[0-9]+]]
+; ATTRIBUTOR-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr nonnull [[B]]) #[[ATTR15:[0-9]+]]
; ATTRIBUTOR-NEXT: ret void
; ATTRIBUTOR: if.else:
-; ATTRIBUTOR-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun2(ptr nonnull [[A]], ptr [[B]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: ret void
;
%cmp = icmp eq i8 %c, 0
@@ -550,17 +550,17 @@ define void @f17(ptr %a, i8 %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define void @f17(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR7]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i8 [[C:%.*]]) #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp eq i8 [[C]], 0
; ATTRIBUTOR-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
; ATTRIBUTOR: if.then:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT:%.*]]
; ATTRIBUTOR: if.else:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT]]
; ATTRIBUTOR: cont:
-; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: ret void
;
%cmp = icmp eq i8 %c, 0
@@ -611,26 +611,26 @@ define void @f18(ptr %a, ptr %b, i8 %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define void @f18(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]]) #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[CMP1:%.*]] = icmp eq i8 [[C]], 0
; ATTRIBUTOR-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
; ATTRIBUTOR: if.then:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT:%.*]]
; ATTRIBUTOR: if.else:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT]]
; ATTRIBUTOR: cont:
; ATTRIBUTOR-NEXT: [[CMP2:%.*]] = icmp eq i8 [[C]], 1
; ATTRIBUTOR-NEXT: br i1 [[CMP2]], label [[CONT_THEN:%.*]], label [[CONT_ELSE:%.*]]
; ATTRIBUTOR: cont.then:
-; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[B]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT2:%.*]]
; ATTRIBUTOR: cont.else:
-; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun0() #[[ATTR15]]
; ATTRIBUTOR-NEXT: br label [[CONT2]]
; ATTRIBUTOR: cont2:
-; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: tail call void @fun1(ptr nonnull [[A]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: ret void
;
%cmp1 = icmp eq i8 %c, 0
@@ -674,7 +674,7 @@ define void @f19(ptr %a, ptr %b, i8 %c) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define void @f19(
-; ATTRIBUTOR-SAME: ptr [[A:%.*]], ptr nonnull [[B:%.*]], i8 [[C:%.*]]) #[[ATTR8:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr [[A:%.*]], ptr nonnull [[B:%.*]], i8 [[C:%.*]]) #[[ATTR7:[0-9]+]] {
; ATTRIBUTOR-NEXT: br label [[LOOP_HEADER:%.*]]
; ATTRIBUTOR: loop.header:
; ATTRIBUTOR-NEXT: [[CMP2:%.*]] = icmp eq i8 [[C]], 0
@@ -883,7 +883,7 @@ define i8 @parent7(ptr %a) {
;
; ATTRIBUTOR-LABEL: define i8 @parent7(
; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]]) {
-; ATTRIBUTOR-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull [[A]]) #[[ATTR16]]
+; ATTRIBUTOR-NEXT: [[RET:%.*]] = call i8 @use1safecall(ptr nonnull [[A]]) #[[ATTR15]]
; ATTRIBUTOR-NEXT: call void @use1nonnull(ptr nonnull [[A]])
; ATTRIBUTOR-NEXT: ret i8 [[RET]]
;
@@ -915,7 +915,7 @@ define i1 @parent8(ptr %a, ptr %bogus1, ptr %b) personality ptr @esfp{
; FNATTRS-NEXT: unreachable
;
; ATTRIBUTOR-LABEL: define i1 @parent8(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr nocapture nofree readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR8]] personality ptr @esfp {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], ptr nocapture nofree readnone [[BOGUS1:%.*]], ptr nonnull [[B:%.*]]) #[[ATTR7]] personality ptr @esfp {
; ATTRIBUTOR-NEXT: entry:
; ATTRIBUTOR-NEXT: invoke void @use2nonnull(ptr nonnull [[A]], ptr nonnull [[B]])
; ATTRIBUTOR-NEXT: to label [[CONT:%.*]] unwind label [[EXC:%.*]]
@@ -965,7 +965,7 @@ define ptr @gep1_no_null_opt(ptr %p) #0 {
; FNATTRS-NEXT: ret ptr [[Q]]
;
; ATTRIBUTOR-LABEL: define ptr @gep1_no_null_opt(
-; ATTRIBUTOR-SAME: ptr nofree readnone [[P:%.*]]) #[[ATTR9:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nofree readnone [[P:%.*]]) #[[ATTR8:[0-9]+]] {
; ATTRIBUTOR-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1
; ATTRIBUTOR-NEXT: ret ptr [[Q]]
;
@@ -1006,8 +1006,8 @@ define internal ptr @g2() {
; FNATTRS-SAME: ) #[[ATTR0]] {
; FNATTRS-NEXT: ret ptr inttoptr (i64 4 to ptr)
;
-; ATTRIBUTOR-LABEL: define internal ptr @g2(
-; ATTRIBUTOR-SAME: ) #[[ATTR10:[0-9]+]] {
+; ATTRIBUTOR-LABEL: define internal nonnull ptr @g2(
+; ATTRIBUTOR-SAME: ) #[[ATTR0]] {
; ATTRIBUTOR-NEXT: ret ptr inttoptr (i64 4 to ptr)
;
ret ptr inttoptr (i64 4 to ptr)
@@ -1021,7 +1021,7 @@ define ptr @g1() {
;
; ATTRIBUTOR-LABEL: define ptr @g1(
; ATTRIBUTOR-SAME: ) #[[ATTR0]] {
-; ATTRIBUTOR-NEXT: [[C:%.*]] = call ptr @g2() #[[ATTR10]]
+; ATTRIBUTOR-NEXT: [[C:%.*]] = call ptr @g2() #[[ATTR16:[0-9]+]]
; ATTRIBUTOR-NEXT: ret ptr [[C]]
;
%c = call ptr @g2()
@@ -1036,8 +1036,8 @@ define internal void @called_by_weak(ptr %a) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @called_by_weak(
-; ATTRIBUTOR-SAME: ptr nocapture readnone [[A:%.*]]) #[[ATTR11:[0-9]+]] {
-; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]])
+; ATTRIBUTOR-SAME: ptr nocapture nonnull readnone [[A:%.*]]) #[[ATTR10:[0-9]+]] {
+; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr nonnull [[A]]) #[[ATTR17:[0-9]+]]
; ATTRIBUTOR-NEXT: ret void
;
call void @use_i32_ptr(ptr %a)
@@ -1068,8 +1068,8 @@ define internal void @control(ptr dereferenceable(4) %a) {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @control(
-; ATTRIBUTOR-SAME: ptr nocapture readnone dereferenceable(4) [[A:%.*]]) #[[ATTR11]] {
-; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]])
+; ATTRIBUTOR-SAME: ptr nocapture nonnull readnone dereferenceable(4) [[A:%.*]]) #[[ATTR10]] {
+; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]]) #[[ATTR17]]
; ATTRIBUTOR-NEXT: ret void
;
call void @use_i32_ptr(ptr %a)
@@ -1083,7 +1083,7 @@ define internal void @naked(ptr dereferenceable(4) %a) naked {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @naked(
-; ATTRIBUTOR-SAME: ptr dereferenceable(4) [[A:%.*]]) #[[ATTR12:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR11:[0-9]+]] {
; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]])
; ATTRIBUTOR-NEXT: ret void
;
@@ -1098,7 +1098,7 @@ define internal void @optnone(ptr dereferenceable(4) %a) optnone noinline {
; FNATTRS-NEXT: ret void
;
; ATTRIBUTOR-LABEL: define internal void @optnone(
-; ATTRIBUTOR-SAME: ptr dereferenceable(4) [[A:%.*]]) #[[ATTR13:[0-9]+]] {
+; ATTRIBUTOR-SAME: ptr nonnull dereferenceable(4) [[A:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR-NEXT: call void @use_i32_ptr(ptr [[A]])
; ATTRIBUTOR-NEXT: ret void
;
@@ -1135,35 +1135,20 @@ define void @make_live(ptr nonnull dereferenceable(8) %a) {
declare void @h(ptr) willreturn nounwind
declare i32 @g(ptr) willreturn nounwind
define i32 @nonnull_exec_ctx_1(ptr %a, i32 %b) {
-; FNATTRS-LABEL: define i32 @nonnull_exec_ctx_1(
-; FNATTRS-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
-; FNATTRS-NEXT: en:
-; FNATTRS-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
-; FNATTRS-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
-; FNATTRS: ex:
-; FNATTRS-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
-; FNATTRS-NEXT: ret i32 [[TMP5]]
-; FNATTRS: hd:
-; FNATTRS-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
-; FNATTRS-NEXT: tail call void @h(ptr [[A]])
-; FNATTRS-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
-; FNATTRS-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
-; FNATTRS-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
-;
-; ATTRIBUTOR-LABEL: define i32 @nonnull_exec_ctx_1(
-; ATTRIBUTOR-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] {
-; ATTRIBUTOR-NEXT: en:
-; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
-; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
-; ATTRIBUTOR: ex:
-; ATTRIBUTOR-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
-; ATTRIBUTOR-NEXT: ret i32 [[TMP5]]
-; ATTRIBUTOR: hd:
-; ATTRIBUTOR-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
-; ATTRIBUTOR-NEXT: tail call void @h(ptr [[A]])
-; ATTRIBUTOR-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
-; ATTRIBUTOR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
-; ATTRIBUTOR-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
+; COMMON-LABEL: define i32 @nonnull_exec_ctx_1(
+; COMMON-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7:[0-9]+]] {
+; COMMON-NEXT: en:
+; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; COMMON-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; COMMON: ex:
+; COMMON-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
+; COMMON-NEXT: ret i32 [[TMP5]]
+; COMMON: hd:
+; COMMON-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD]] ], [ 0, [[EN:%.*]] ]
+; COMMON-NEXT: tail call void @h(ptr [[A]])
+; COMMON-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
+; COMMON-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; COMMON-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
;
en:
%tmp3 = icmp eq i32 %b, 0
@@ -1182,39 +1167,22 @@ hd:
}
define i32 @nonnull_exec_ctx_1b(ptr %a, i32 %b) {
-; FNATTRS-LABEL: define i32 @nonnull_exec_ctx_1b(
-; FNATTRS-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
-; FNATTRS-NEXT: en:
-; FNATTRS-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
-; FNATTRS-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
-; FNATTRS: ex:
-; FNATTRS-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
-; FNATTRS-NEXT: ret i32 [[TMP5]]
-; FNATTRS: hd:
-; FNATTRS-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
-; FNATTRS-NEXT: tail call void @h(ptr [[A]])
-; FNATTRS-NEXT: br label [[HD2]]
-; FNATTRS: hd2:
-; FNATTRS-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
-; FNATTRS-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
-; FNATTRS-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
-;
-; ATTRIBUTOR-LABEL: define i32 @nonnull_exec_ctx_1b(
-; ATTRIBUTOR-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR8]] {
-; ATTRIBUTOR-NEXT: en:
-; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
-; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
-; ATTRIBUTOR: ex:
-; ATTRIBUTOR-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
-; ATTRIBUTOR-NEXT: ret i32 [[TMP5]]
-; ATTRIBUTOR: hd:
-; ATTRIBUTOR-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
-; ATTRIBUTOR-NEXT: tail call void @h(ptr [[A]])
-; ATTRIBUTOR-NEXT: br label [[HD2]]
-; ATTRIBUTOR: hd2:
-; ATTRIBUTOR-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
-; ATTRIBUTOR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
-; ATTRIBUTOR-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
+; COMMON-LABEL: define i32 @nonnull_exec_ctx_1b(
+; COMMON-SAME: ptr [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
+; COMMON-NEXT: en:
+; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
+; COMMON-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
+; COMMON: ex:
+; COMMON-NEXT: [[TMP5:%.*]] = tail call i32 @g(ptr nonnull [[A]])
+; COMMON-NEXT: ret i32 [[TMP5]]
+; COMMON: hd:
+; COMMON-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP8:%.*]], [[HD2:%.*]] ], [ 0, [[EN:%.*]] ]
+; COMMON-NEXT: tail call void @h(ptr [[A]])
+; COMMON-NEXT: br label [[HD2]]
+; COMMON: hd2:
+; COMMON-NEXT: [[TMP8]] = add nuw i32 [[TMP7]], 1
+; COMMON-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], [[B]]
+; COMMON-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
;
en:
%tmp3 = icmp eq i32 %b, 0
@@ -1252,7 +1220,7 @@ define i32 @nonnull_exec_ctx_2(ptr %a, i32 %b) willreturn nounwind {
; FNATTRS-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
;
; ATTRIBUTOR-LABEL: define i32 @nonnull_exec_ctx_2(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR6]] {
; ATTRIBUTOR-NEXT: en:
; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
@@ -1301,7 +1269,7 @@ define i32 @nonnull_exec_ctx_2b(ptr %a, i32 %b) willreturn nounwind {
; FNATTRS-NEXT: br i1 [[TMP9]], label [[EX]], label [[HD]]
;
; ATTRIBUTOR-LABEL: define i32 @nonnull_exec_ctx_2b(
-; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR7]] {
+; ATTRIBUTOR-SAME: ptr nonnull [[A:%.*]], i32 [[B:%.*]]) #[[ATTR6]] {
; ATTRIBUTOR-NEXT: en:
; ATTRIBUTOR-NEXT: [[TMP3:%.*]] = icmp eq i32 [[B]], 0
; ATTRIBUTOR-NEXT: br i1 [[TMP3]], label [[EX:%.*]], label [[HD:%.*]]
diff --git a/llvm/test/Transforms/FunctionAttrs/norecurse.ll b/llvm/test/Transforms/FunctionAttrs/norecurse.ll
index 7924428fb498..a902974fed28 100644
--- a/llvm/test/Transforms/FunctionAttrs/norecurse.ll
+++ b/llvm/test/Transforms/FunctionAttrs/norecurse.ll
@@ -4,10 +4,15 @@
define i32 @leaf() {
-; COMMON: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; COMMON-LABEL: define {{[^@]+}}@leaf
-; COMMON-SAME: () #[[ATTR0:[0-9]+]] {
-; COMMON-NEXT: ret i32 1
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; FNATTRS-LABEL: define {{[^@]+}}@leaf
+; FNATTRS-SAME: () #[[ATTR0:[0-9]+]] {
+; FNATTRS-NEXT: ret i32 1
+;
+; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+; ATTRIBUTOR-LABEL: define {{[^@]+}}@leaf
+; ATTRIBUTOR-SAME: () #[[ATTR0:[0-9]+]] {
+; ATTRIBUTOR-NEXT: ret i32 1
;
ret i32 1
}
@@ -108,9 +113,9 @@ define internal i32 @called_by_norecurse() {
; FNATTRS-NEXT: [[A:%.*]] = call i32 @k()
; FNATTRS-NEXT: ret i32 [[A]]
;
-; ATTRIBUTOR: Function Attrs: nosync memory(none)
+; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@called_by_norecurse
-; ATTRIBUTOR-SAME: () #[[ATTR2]] {
+; ATTRIBUTOR-SAME: () #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]]
; ATTRIBUTOR-NEXT: ret i32 [[A]]
;
@@ -127,7 +132,7 @@ define void @m() norecurse {
;
; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@m
-; ATTRIBUTOR-SAME: () #[[ATTR6:[0-9]+]] {
+; ATTRIBUTOR-SAME: () #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @called_by_norecurse() #[[ATTR2]]
; ATTRIBUTOR-NEXT: ret void
;
@@ -142,9 +147,9 @@ define internal i32 @called_by_norecurse_indirectly() {
; FNATTRS-NEXT: [[A:%.*]] = call i32 @k()
; FNATTRS-NEXT: ret i32 [[A]]
;
-; ATTRIBUTOR: Function Attrs: nosync memory(none)
+; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@called_by_norecurse_indirectly
-; ATTRIBUTOR-SAME: () #[[ATTR2]] {
+; ATTRIBUTOR-SAME: () #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]]
; ATTRIBUTOR-NEXT: ret i32 [[A]]
;
@@ -159,9 +164,9 @@ define internal void @o() {
; FNATTRS-NEXT: [[A:%.*]] = call i32 @called_by_norecurse_indirectly()
; FNATTRS-NEXT: ret void
;
-; ATTRIBUTOR: Function Attrs: nosync memory(none)
+; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@o
-; ATTRIBUTOR-SAME: () #[[ATTR2]] {
+; ATTRIBUTOR-SAME: () #[[ATTR6]] {
; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @called_by_norecurse_indirectly() #[[ATTR2]]
; ATTRIBUTOR-NEXT: ret void
;
@@ -213,7 +218,7 @@ define internal void @q() {
; ATTRIBUTOR: Function Attrs: norecurse nosync memory(none)
; ATTRIBUTOR-LABEL: define {{[^@]+}}@q
; ATTRIBUTOR-SAME: () #[[ATTR6]] {
-; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @escapes_as_parameter(ptr nonnull @escapes_as_parameter) #[[ATTR2]]
+; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @escapes_as_parameter(ptr nocapture nofree nonnull readnone @escapes_as_parameter) #[[ATTR2]]
; ATTRIBUTOR-NEXT: ret void
;
%a = call i32 @escapes_as_parameter(ptr @escapes_as_parameter)
@@ -255,3 +260,5 @@ define void @r() norecurse {
; ATTRIBUTOR: attributes #[[ATTR7]] = { nosync }
; ATTRIBUTOR: attributes #[[ATTR8]] = { nofree willreturn }
;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; COMMON: {{.*}}
diff --git a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll
index 3640eb59b884..be61990fd627 100644
--- a/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll
+++ b/llvm/test/Transforms/FunctionAttrs/read-write-scc.ll
@@ -4,7 +4,7 @@
@i = global i32 0
define void @foo() {
-; CHECK: Function Attrs: nofree nosync nounwind
+; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none)
; CHECK-LABEL: define {{[^@]+}}@foo
; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: store i32 1, ptr @i, align 4
@@ -17,7 +17,7 @@ define void @foo() {
}
define void @bar() {
-; CHECK: Function Attrs: nofree nosync nounwind
+; CHECK: Function Attrs: nofree nosync nounwind memory(readwrite, argmem: none, inaccessiblemem: none)
; CHECK-LABEL: define {{[^@]+}}@bar
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: [[I:%.*]] = load i32, ptr @i, align 4
diff --git a/llvm/test/Transforms/FunctionAttrs/willreturn.ll b/llvm/test/Transforms/FunctionAttrs/willreturn.ll
index bf3f4adf7eaa..70926345ce27 100644
--- a/llvm/test/Transforms/FunctionAttrs/willreturn.ll
+++ b/llvm/test/Transforms/FunctionAttrs/willreturn.ll
@@ -102,23 +102,23 @@ define i64 @mustprogress_mayunwind() mustprogress personality ptr @__gxx_persona
; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
; FNATTRS-LABEL: @mustprogress_mayunwind(
; FNATTRS-NEXT: [[A:%.*]] = invoke i64 @fn_noread()
-; FNATTRS-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
+; FNATTRS-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
; FNATTRS: A:
; FNATTRS-NEXT: ret i64 10
; FNATTRS: B:
; FNATTRS-NEXT: [[VAL:%.*]] = landingpad { ptr, i32 }
-; FNATTRS-NEXT: catch ptr null
+; FNATTRS-NEXT: catch ptr null
; FNATTRS-NEXT: ret i64 0
;
; ATTRIBUTOR: Function Attrs: mustprogress nosync nounwind willreturn memory(none)
; ATTRIBUTOR-LABEL: @mustprogress_mayunwind(
-; ATTRIBUTOR-NEXT: [[A:%.*]] = invoke i64 @fn_noread()
-; ATTRIBUTOR-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
+; ATTRIBUTOR-NEXT: [[A:%.*]] = invoke i64 @fn_noread() #[[ATTR13:[0-9]+]]
+; ATTRIBUTOR-NEXT: to label [[A:%.*]] unwind label [[B:%.*]]
; ATTRIBUTOR: A:
; ATTRIBUTOR-NEXT: ret i64 10
; ATTRIBUTOR: B:
; ATTRIBUTOR-NEXT: [[VAL:%.*]] = landingpad { ptr, i32 }
-; ATTRIBUTOR-NEXT: catch ptr null
+; ATTRIBUTOR-NEXT: catch ptr null
; ATTRIBUTOR-NEXT: ret i64 0
;
%a = invoke i64 @fn_noread()
diff --git a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll
index c5f656c870a2..99541b398226 100644
--- a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll
+++ b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll
@@ -265,16 +265,17 @@ define i32 @test5(ptr %a, i32 %b) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SUM_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[INDVARS_IV]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[TMP1]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP2]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: br label [[FOR_COND]]
; CHECK: for.end:
@@ -349,22 +350,23 @@ define i32 @test7(ptr %a, i32 %b) {
; CHECK-LABEL: @test7(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[B]], i32 -1)
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SMAX]], 2
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SMAX]], 2
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: br label [[FOR_COND:%.*]]
; CHECK: for.cond:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[SUM_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[INDVARS_IV]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV]], [[TMP1]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP3]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND]], label [[FOR_END]]
+; CHECK-NEXT: [[EXITCOND2:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND2]], label [[FOR_COND]], label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_0]], [[FOR_BODY]] ], [ [[SUM_0]], [[FOR_COND]] ]
; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
diff --git a/llvm/test/Transforms/InstCombine/load-cmp.ll b/llvm/test/Transforms/InstCombine/load-cmp.ll
index e941284a798e..b956de29e0b8 100644
--- a/llvm/test/Transforms/InstCombine/load-cmp.ll
+++ b/llvm/test/Transforms/InstCombine/load-cmp.ll
@@ -334,3 +334,20 @@ define i1 @test10_struct_arr_noinbounds_i64(i64 %x) {
%r = icmp eq i32 %q, 9
ret i1 %r
}
+
+@table = internal constant [2 x ptr] [ptr @g, ptr getelementptr (i8, ptr @g, i64 4)], align 16
+@g = external global [2 x i32]
+
+define i1 @pr93017(i64 %idx) {
+; CHECK-LABEL: @pr93017(
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[IDX:%.*]] to i32
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [2 x ptr], ptr @table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[V:%.*]] = load ptr, ptr [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[V]], null
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep = getelementptr inbounds [2 x ptr], ptr @table, i64 0, i64 %idx
+ %v = load ptr, ptr %gep
+ %cmp = icmp ne ptr %v, null
+ ret i1 %cmp
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 54348d1e2a48..24d624c221f4 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -interleaved-access -S | FileCheck %s --check-prefix=NEON
-; RUN: opt < %s -interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible-sve -S | FileCheck %s --check-prefix=SVE-FIXED
+; RUN: opt < %s -interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible -S | FileCheck %s --check-prefix=SVE-FIXED
; RUN: opt < %s -passes=interleaved-access -S | FileCheck %s --check-prefix=NEON
-; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible-sve -S | FileCheck %s --check-prefix=SVE-FIXED
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible -S | FileCheck %s --check-prefix=SVE-FIXED
target triple = "aarch64-linux-gnu"
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll
index 03b1aece9e87..a6bff63dfc71 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr40514.ll
@@ -54,4 +54,4 @@ bb10: ; preds = %bb10, %bb
}
-attributes #0 = { "target-cpu"="broadwell" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
+attributes #0 = { "target-cpu"="broadwell" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,-ptwrite,-xsavec,+popcnt,+aes,-avx512bitalg,-movdiri,-xsaves,-avx512vnni,-avx512vpopcntdq,-pconfig,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-movdir64b,-sse4a,-avx512bw,-clflushopt,+xsave,-avx512vbmi2,+64bit,-avx512vl,+invpcid,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,+f16c,+ssse3,-sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 9d8d7036d4f4..a74b0b441771 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -634,6 +634,247 @@ exit:
ret void
}
+define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, ptr %E, i64 %N) "target-features"="+sve" {
+; DEFAULT-LABEL: define i32 @header_mask_and_invariant_compare(
+; DEFAULT-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; DEFAULT-NEXT: entry:
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
+; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 64, i64 [[TMP2]])
+; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
+; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; DEFAULT: vector.memcheck:
+; DEFAULT-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4
+; DEFAULT-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 2
+; DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 4
+; DEFAULT-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP5]]
+; DEFAULT-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; DEFAULT-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; DEFAULT-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C]], i64 4
+; DEFAULT-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[E]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[D]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; DEFAULT-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[E]], [[SCEVGEP2]]
+; DEFAULT-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
+; DEFAULT-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
+; DEFAULT-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[E]], [[SCEVGEP3]]
+; DEFAULT-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; DEFAULT-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; DEFAULT-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[E]], [[SCEVGEP4]]
+; DEFAULT-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; DEFAULT-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; DEFAULT-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[D]], [[SCEVGEP2]]
+; DEFAULT-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; DEFAULT-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
+; DEFAULT-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[D]], [[SCEVGEP3]]
+; DEFAULT-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
+; DEFAULT-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; DEFAULT-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[D]], [[SCEVGEP4]]
+; DEFAULT-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]]
+; DEFAULT-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
+; DEFAULT-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; DEFAULT-NEXT: br i1 [[CONFLICT_RDX27]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; DEFAULT: vector.ph:
+; DEFAULT-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
+; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP7]]
+; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; DEFAULT-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; DEFAULT-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT32:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[E]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT33:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT32]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
+; DEFAULT: vector.body:
+; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; DEFAULT-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; DEFAULT-NEXT: [[TMP11:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META9:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT28]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP12:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META12:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP12]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP13:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
+; DEFAULT-NEXT: [[TMP14:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META14:![0-9]+]]
+; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP14]], i64 0
+; DEFAULT-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT30]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; DEFAULT-NEXT: [[TMP15:%.*]] = icmp ugt <vscale x 4 x i32> [[BROADCAST_SPLAT31]], [[TMP13]]
+; DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[D]], i64 [[TMP10]]
+; DEFAULT-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP13]], <vscale x 4 x ptr> [[BROADCAST_SPLAT33]], i32 4, <vscale x 4 x i1> [[TMP15]]), !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
+; DEFAULT-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
+; DEFAULT-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr [[TMP17]], i32 4, <vscale x 4 x i1> [[TMP15]]), !alias.scope [[META20:![0-9]+]], !noalias [[META21:![0-9]+]]
+; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; DEFAULT: middle.block:
+; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; DEFAULT: scalar.ph:
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; DEFAULT-NEXT: br label [[LOOP_HEADER:%.*]]
+; DEFAULT: loop.header:
+; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; DEFAULT-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
+; DEFAULT-NEXT: [[L_B:%.*]] = load i32, ptr [[B]], align 4
+; DEFAULT-NEXT: [[OR:%.*]] = or i32 [[L_B]], [[L_A]]
+; DEFAULT-NEXT: [[L_C:%.*]] = load i32, ptr [[C]], align 4
+; DEFAULT-NEXT: [[C_0:%.*]] = icmp ugt i32 [[L_C]], [[OR]]
+; DEFAULT-NEXT: br i1 [[C_0]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; DEFAULT: if.then:
+; DEFAULT-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[D]], i64 [[IV]]
+; DEFAULT-NEXT: store i32 [[OR]], ptr [[E]], align 4
+; DEFAULT-NEXT: store i32 0, ptr [[GEP_D]], align 4
+; DEFAULT-NEXT: br label [[LOOP_LATCH]]
+; DEFAULT: loop.latch:
+; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; DEFAULT-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]]
+; DEFAULT-NEXT: br i1 [[C_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP23:![0-9]+]]
+; DEFAULT: exit:
+; DEFAULT-NEXT: ret i32 0
+;
+; PRED-LABEL: define i32 @header_mask_and_invariant_compare(
+; PRED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
+; PRED-NEXT: entry:
+; PRED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; PRED: vector.memcheck:
+; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[E]], i64 4
+; PRED-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 2
+; PRED-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; PRED-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[D]], i64 [[TMP2]]
+; PRED-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; PRED-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; PRED-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[C]], i64 4
+; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[E]], [[SCEVGEP1]]
+; PRED-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[D]], [[SCEVGEP]]
+; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; PRED-NEXT: [[BOUND05:%.*]] = icmp ult ptr [[E]], [[SCEVGEP2]]
+; PRED-NEXT: [[BOUND16:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
+; PRED-NEXT: [[FOUND_CONFLICT7:%.*]] = and i1 [[BOUND05]], [[BOUND16]]
+; PRED-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT7]]
+; PRED-NEXT: [[BOUND08:%.*]] = icmp ult ptr [[E]], [[SCEVGEP3]]
+; PRED-NEXT: [[BOUND19:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]]
+; PRED-NEXT: [[FOUND_CONFLICT10:%.*]] = and i1 [[BOUND08]], [[BOUND19]]
+; PRED-NEXT: [[CONFLICT_RDX11:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT10]]
+; PRED-NEXT: [[BOUND012:%.*]] = icmp ult ptr [[E]], [[SCEVGEP4]]
+; PRED-NEXT: [[BOUND113:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]]
+; PRED-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
+; PRED-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX11]], [[FOUND_CONFLICT14]]
+; PRED-NEXT: [[BOUND016:%.*]] = icmp ult ptr [[D]], [[SCEVGEP2]]
+; PRED-NEXT: [[BOUND117:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]]
+; PRED-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; PRED-NEXT: [[CONFLICT_RDX19:%.*]] = or i1 [[CONFLICT_RDX15]], [[FOUND_CONFLICT18]]
+; PRED-NEXT: [[BOUND020:%.*]] = icmp ult ptr [[D]], [[SCEVGEP3]]
+; PRED-NEXT: [[BOUND121:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]]
+; PRED-NEXT: [[FOUND_CONFLICT22:%.*]] = and i1 [[BOUND020]], [[BOUND121]]
+; PRED-NEXT: [[CONFLICT_RDX23:%.*]] = or i1 [[CONFLICT_RDX19]], [[FOUND_CONFLICT22]]
+; PRED-NEXT: [[BOUND024:%.*]] = icmp ult ptr [[D]], [[SCEVGEP4]]
+; PRED-NEXT: [[BOUND125:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]]
+; PRED-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
+; PRED-NEXT: [[CONFLICT_RDX27:%.*]] = or i1 [[CONFLICT_RDX23]], [[FOUND_CONFLICT26]]
+; PRED-NEXT: br i1 [[CONFLICT_RDX27]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; PRED: vector.ph:
+; PRED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
+; PRED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
+; PRED-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], 1
+; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], [[TMP7]]
+; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
+; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; PRED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; PRED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; PRED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; PRED-NEXT: [[TMP12:%.*]] = sub i64 [[TMP0]], [[TMP11]]
+; PRED-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]]
+; PRED-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
+; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
+; PRED-NEXT: [[BROADCAST_SPLATINSERT32:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[E]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT33:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT32]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
+; PRED: vector.body:
+; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PRED-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0
+; PRED-NEXT: [[TMP16:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META6:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP16]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT28]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[TMP17:%.*]] = load i32, ptr [[B]], align 4, !alias.scope [[META9:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP17]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[TMP18:%.*]] = or <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT29]]
+; PRED-NEXT: [[TMP19:%.*]] = load i32, ptr [[C]], align 4, !alias.scope [[META11:![0-9]+]]
+; PRED-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP19]], i64 0
+; PRED-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT30]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; PRED-NEXT: [[TMP20:%.*]] = icmp ugt <vscale x 4 x i32> [[BROADCAST_SPLAT31]], [[TMP18]]
+; PRED-NEXT: [[TMP21:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i1> zeroinitializer
+; PRED-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[D]], i64 [[TMP15]]
+; PRED-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[BROADCAST_SPLAT33]], i32 4, <vscale x 4 x i1> [[TMP21]]), !alias.scope [[META13:![0-9]+]], !noalias [[META15:![0-9]+]]
+; PRED-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
+; PRED-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> zeroinitializer, ptr [[TMP23]], i32 4, <vscale x 4 x i1> [[TMP21]]), !alias.scope [[META17:![0-9]+]], !noalias [[META18:![0-9]+]]
+; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
+; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]])
+; PRED-NEXT: [[TMP24:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; PRED-NEXT: [[TMP25:%.*]] = extractelement <vscale x 4 x i1> [[TMP24]], i32 0
+; PRED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; PRED: middle.block:
+; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; PRED: scalar.ph:
+; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; PRED-NEXT: br label [[LOOP_HEADER:%.*]]
+; PRED: loop.header:
+; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; PRED-NEXT: [[L_A:%.*]] = load i32, ptr [[A]], align 4
+; PRED-NEXT: [[L_B:%.*]] = load i32, ptr [[B]], align 4
+; PRED-NEXT: [[OR:%.*]] = or i32 [[L_B]], [[L_A]]
+; PRED-NEXT: [[L_C:%.*]] = load i32, ptr [[C]], align 4
+; PRED-NEXT: [[C_0:%.*]] = icmp ugt i32 [[L_C]], [[OR]]
+; PRED-NEXT: br i1 [[C_0]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]]
+; PRED: if.then:
+; PRED-NEXT: [[GEP_D:%.*]] = getelementptr i32, ptr [[D]], i64 [[IV]]
+; PRED-NEXT: store i32 [[OR]], ptr [[E]], align 4
+; PRED-NEXT: store i32 0, ptr [[GEP_D]], align 4
+; PRED-NEXT: br label [[LOOP_LATCH]]
+; PRED: loop.latch:
+; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; PRED-NEXT: [[C_1:%.*]] = icmp eq i64 [[IV]], [[N]]
+; PRED-NEXT: br i1 [[C_1]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP20:![0-9]+]]
+; PRED: exit:
+; PRED-NEXT: ret i32 0
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %l.A = load i32, ptr %A, align 4
+ %l.B = load i32, ptr %B, align 4
+ %or = or i32 %l.B, %l.A
+ %l.C = load i32, ptr %C, align 4
+ %c.0 = icmp ugt i32 %l.C, %or
+ br i1 %c.0, label %if.then, label %loop.latch
+
+if.then:
+ %gep.D = getelementptr i32, ptr %D, i64 %iv
+ store i32 %or, ptr %E, align 4
+ store i32 0, ptr %gep.D, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add i64 %iv, 1
+ %c.1 = icmp eq i64 %iv, %N
+ br i1 %c.1, label %exit, label %loop.header
+
+exit:
+ ret i32 0
+}
+
;.
; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -644,6 +885,21 @@ exit:
; DEFAULT: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
; DEFAULT: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
; DEFAULT: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
+; DEFAULT: [[META9]] = !{[[META10:![0-9]+]]}
+; DEFAULT: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]}
+; DEFAULT: [[META11]] = distinct !{[[META11]], !"LVerDomain"}
+; DEFAULT: [[META12]] = !{[[META13:![0-9]+]]}
+; DEFAULT: [[META13]] = distinct !{[[META13]], [[META11]]}
+; DEFAULT: [[META14]] = !{[[META15:![0-9]+]]}
+; DEFAULT: [[META15]] = distinct !{[[META15]], [[META11]]}
+; DEFAULT: [[META16]] = !{[[META17:![0-9]+]]}
+; DEFAULT: [[META17]] = distinct !{[[META17]], [[META11]]}
+; DEFAULT: [[META18]] = !{[[META19:![0-9]+]], [[META10]], [[META13]], [[META15]]}
+; DEFAULT: [[META19]] = distinct !{[[META19]], [[META11]]}
+; DEFAULT: [[META20]] = !{[[META19]]}
+; DEFAULT: [[META21]] = !{[[META10]], [[META13]], [[META15]]}
+; DEFAULT: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]}
+; DEFAULT: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]]}
;.
; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -651,4 +907,19 @@ exit:
; PRED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
; PRED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; PRED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; PRED: [[META6]] = !{[[META7:![0-9]+]]}
+; PRED: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]}
+; PRED: [[META8]] = distinct !{[[META8]], !"LVerDomain"}
+; PRED: [[META9]] = !{[[META10:![0-9]+]]}
+; PRED: [[META10]] = distinct !{[[META10]], [[META8]]}
+; PRED: [[META11]] = !{[[META12:![0-9]+]]}
+; PRED: [[META12]] = distinct !{[[META12]], [[META8]]}
+; PRED: [[META13]] = !{[[META14:![0-9]+]]}
+; PRED: [[META14]] = distinct !{[[META14]], [[META8]]}
+; PRED: [[META15]] = !{[[META16:![0-9]+]], [[META7]], [[META10]], [[META12]]}
+; PRED: [[META16]] = distinct !{[[META16]], [[META8]]}
+; PRED: [[META17]] = !{[[META16]]}
+; PRED: [[META18]] = !{[[META7]], [[META10]], [[META12]]}
+; PRED: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]]}
+; PRED: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]}
;.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
index b89d09f25896..6b10d4591f41 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/streaming-compatible-sve-no-maximize-bandwidth.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -force-streaming-compatible-sve -enable-fixedwidth-autovec-in-streaming-mode -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=SC_SVE
+; RUN: opt < %s -passes=loop-vectorize -force-streaming-compatible -enable-fixedwidth-autovec-in-streaming-mode -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=SC_SVE
; RUN: opt < %s -passes=loop-vectorize -mattr=+sve -force-target-instruction-cost=1 -scalable-vectorization=off -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=NO_SC_SVE
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
index c3e30f1f81f4..e796e40a7591 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll
@@ -592,7 +592,41 @@ define dso_local i32 @predicated_test(i32 noundef %0, ptr %glob) #0 {
ret i32 0
}
+; This has a maximum trip count of 4. The codegen is currently much better with <8 x half> vectorization.
+; CHECK-LABEL: arm_q15_to_f16_remainder
+; CHECK: LV: Selecting VF: 8
+define void @arm_q15_to_f16_remainder(ptr nocapture noundef readonly %pSrc, ptr nocapture noundef writeonly noalias %pDst, i32 noundef %blockSize) #0 {
+entry:
+ %rem = and i32 %blockSize, 3
+ %cmp.not5 = icmp eq i32 %rem, 0
+ br i1 %cmp.not5, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %blkCnt.08 = phi i32 [ %dec, %while.body ], [ %rem, %while.body.preheader ]
+ %pIn.07 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrc, %while.body.preheader ]
+ %pDst.addr.06 = phi ptr [ %incdec.ptr2, %while.body ], [ %pDst, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds i8, ptr %pIn.07, i32 2
+ %0 = load i16, ptr %pIn.07, align 2
+ %conv1 = sitofp i16 %0 to half
+ %1 = fmul fast half %conv1, 0xH0200
+ %incdec.ptr2 = getelementptr inbounds i8, ptr %pDst.addr.06, i32 2
+ store half %1, ptr %pDst.addr.06, align 2
+ %dec = add nsw i32 %blkCnt.08, -1
+ %cmp.not = icmp eq i32 %dec, 0
+ br i1 %cmp.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ ret void
+}
+
+
declare void @llvm.lifetime.start.p0(i64, ptr)
declare void @llvm.lifetime.end.p0(i64, ptr)
-attributes #0 = { "target-features"="+mve" }
+attributes #0 = { "target-features"="+mve.fp" }
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
index b88254e7b678..786197bfdb90 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization-profitability.ll
@@ -10,7 +10,7 @@ target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
; Do not vectorize epilogues for loops with minsize attribute
-; CHECK-LABLE: @f1
+; CHECK-LABEL: @f1
; CHECK-NOT: vector.main.loop.iter.check
; CHECK-NOT: vec.epilog.iter.check
; CHECK-NOT: vec.epilog.ph
@@ -48,7 +48,7 @@ for.end: ; preds = %for.end.loopexit, %
}
; Do not vectorize epilogues for loops with optsize attribute
-; CHECK-LABLE: @f2
+; CHECK-LABEL: @f2
; CHECK-NOT: vector.main.loop.iter.check
; CHECK-NOT: vec.epilog.iter.check
; CHECK-NOT: vec.epilog.ph
@@ -86,7 +86,7 @@ for.end: ; preds = %for.end.loopexit, %
}
; Do not vectorize the epilogue for loops with VF less than the default -epilogue-vectorization-minimum-VF of 16.
-; CHECK-MIN-D-LABLE: @f3
+; CHECK-MIN-D-LABEL: @f3
; CHECK-MIN-D-NOT: vector.main.loop.iter.check
; CHECK-MIN-D-NOT: vec.epilog.iter.check
; CHECK-MIN-D-NOT: vec.epilog.ph
@@ -96,7 +96,7 @@ for.end: ; preds = %for.end.loopexit, %
; Specify a smaller minimum VF (via `-epilogue-vectorization-minimum-VF=4`) and
; make sure the epilogue gets vectorized in that case.
-; CHECK-MIN-D-LABLE: @f3
+; CHECK-MIN-4-LABEL: @f3
; CHECK-MIN-4: vector.main.loop.iter.check
; CHECK-MIN-4: vec.epilog.iter.check
; CHECK-MIN-4: vec.epilog.ph
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
new file mode 100644
index 000000000000..e40f51fd7bd7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data-with-evl \
+; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \
+; RUN: -mtriple=riscv64 -mattr=+v -S < %s | FileCheck %s
+
+; Make sure we do not vectorize a loop with a widened int induction.
+define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
+; CHECK-LABEL: define void @test_wide_integer_induction(
+; CHECK-SAME: ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: store i64 [[IV]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+ store i64 %iv, ptr %arrayidx, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
+; Make sure we do not vectorize a loop with a widened ptr induction.
+define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) {
+; CHECK-LABEL: define void @test_wide_ptr_induction(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[VECTOR_BODY]] ], [ [[B]], [[VECTOR_PH]] ]
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[ADDR]], i64 8
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
+; CHECK-NEXT: store ptr [[ADDR]], ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %addr = phi ptr [ %incdec.ptr, %for.body ], [ %b, %entry ]
+ %incdec.ptr = getelementptr inbounds i8, ptr %addr, i64 8
+ %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
+ store ptr %addr, ptr %arrayidx, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %N
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
new file mode 100644
index 000000000000..a91f92348ab2
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+
+define void @test(ptr %p, i64 %a, i8 %b) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[B]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY1:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE8]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 3)
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[VEC_IND]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i64> [[BROADCAST_SPLAT]], <i64 48, i64 48, i64 48, i64 48>
+; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i64> [[TMP2]], <i64 52, i64 52, i64 52, i64 52>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], <i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[VECTOR_BODY:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i8> [[TMP8]], i32 0
+; CHECK-NEXT: store i8 [[TMP10]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[VECTOR_BODY]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
+; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK: pred.store.if3:
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i8> [[TMP8]], i32 1
+; CHECK-NEXT: store i8 [[TMP12]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.continue4:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i8> [[TMP8]], i32 2
+; CHECK-NEXT: store i8 [[TMP14]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.if7:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i8> [[TMP8]], i32 3
+; CHECK-NEXT: store i8 [[TMP16]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.continue8:
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT: [[ADD]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[CMP_SLT:%.*]] = icmp slt i32 [[IV]], 2
+; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[A]], 48
+; CHECK-NEXT: [[ASHR:%.*]] = ashr i64 [[SHL]], 52
+; CHECK-NEXT: [[TRUNC_I32:%.*]] = trunc i64 [[ASHR]] to i32
+; CHECK-NEXT: br i1 [[CMP_SLT]], label [[COND_FALSE:%.*]], label [[FOR_BODY]]
+; CHECK: cond.false:
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 [[B]] to i32
+; CHECK-NEXT: br label [[FOR_BODY]]
+; CHECK: for.body:
+; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TRUNC_I32]], [[FOR_COND]] ], [ [[ZEXT]], [[COND_FALSE]] ]
+; CHECK-NEXT: [[SHL_I32:%.*]] = shl i32 [[COND]], 8
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHL_I32]] to i8
+; CHECK-NEXT: store i8 [[TRUNC]], ptr [[P]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[IV]], 2
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %iv = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %add = add i32 %iv, 1
+ %cmp.slt = icmp slt i32 %iv, 2
+ %shl = shl i64 %a, 48
+ %ashr = ashr i64 %shl, 52
+ %trunc.i32 = trunc i64 %ashr to i32
+ br i1 %cmp.slt, label %cond.false, label %for.body
+
+cond.false: ; preds = %for.cond
+ %zext = zext i8 %b to i32
+ br label %for.body
+
+for.body: ; preds = %cond.false, %for.cond
+ %cond = phi i32 [ %trunc.i32, %for.cond ], [ %zext, %cond.false ]
+ %shl.i32 = shl i32 %cond, 8
+ %trunc = trunc i32 %shl.i32 to i8
+ store i8 %trunc, ptr %p, align 1
+ %cmp = icmp slt i32 %iv, 2
+ br i1 %cmp, label %for.cond, label %exit
+
+exit: ; preds = %for.body
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
index ae01bdd37110..a52da79ee396 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-gather-scatter.ll
@@ -12,66 +12,18 @@
define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %index, i64 %n) {
; IF-EVL-LABEL: @gather_scatter(
; IF-EVL-NEXT: entry:
-; IF-EVL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N:%.*]]
-; IF-EVL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
-; IF-EVL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
-; IF-EVL-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; IF-EVL: vector.ph:
-; IF-EVL-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; IF-EVL-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
-; IF-EVL-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1
-; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]]
-; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
-; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; IF-EVL-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; IF-EVL-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; IF-EVL-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP11]], zeroinitializer
-; IF-EVL-NEXT: [[TMP13:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP13]]
-; IF-EVL-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; IF-EVL-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
-; IF-EVL-NEXT: [[TMP16:%.*]] = mul i64 1, [[TMP15]]
-; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP16]], i64 0
-; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
-; IF-EVL: vector.body:
-; IF-EVL-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IF-EVL-NEXT: [[TMP17:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP17]], i32 2, i1 true)
-; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], <vscale x 2 x i64> [[VEC_IND]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.vp.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP20]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
-; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 2 x float> @llvm.vp.gather.nxv2f32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP21]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
-; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], <vscale x 2 x i64> [[WIDE_MASKED_GATHER]]
-; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv2f32.nxv2p0(<vscale x 2 x float> [[WIDE_MASKED_GATHER2]], <vscale x 2 x ptr> align 4 [[TMP22]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), i32 [[TMP18]])
-; IF-EVL-NEXT: [[TMP23:%.*]] = zext i32 [[TMP18]] to i64
-; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]]
-; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX1]], [[TMP10]]
-; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; IF-EVL: middle.block:
-; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; IF-EVL: scalar.ph:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; IF-EVL-NEXT: br label [[FOR_BODY:%.*]]
; IF-EVL: for.body:
-; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX]], i64 [[INDVARS_IV]]
-; IF-EVL-NEXT: [[TMP25:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
-; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[TMP25]]
-; IF-EVL-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
-; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[TMP25]]
-; IF-EVL-NEXT: store float [[TMP26]], ptr [[ARRAYIDX7]], align 4
+; IF-EVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; IF-EVL-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], i64 [[INDVARS_IV]]
+; IF-EVL-NEXT: [[TMP0:%.*]] = load i64, ptr [[ARRAYIDX3]], align 8
+; IF-EVL-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[IN:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; IF-EVL-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[OUT:%.*]], i64 [[TMP0]]
+; IF-EVL-NEXT: store float [[TMP1]], ptr [[ARRAYIDX7]], align 4
; IF-EVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N:%.*]]
+; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; IF-EVL: for.end:
; IF-EVL-NEXT: ret void
;
diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
new file mode 100644
index 000000000000..07a1cca1bc21
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -mtriple=s390x -mcpu=z14 -S %s | FileCheck %s
+
+define void @test(ptr %p, i40 %a) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]], i40 [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i40> poison, i40 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i40> [[BROADCAST_SPLATINSERT1]], <16 x i40> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE32:%.*]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <16 x i32> [[VEC_IV]], <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
+; CHECK-NEXT: [[TMP1:%.*]] = shl <16 x i40> [[BROADCAST_SPLAT2]], <i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24, i40 24>
+; CHECK-NEXT: [[TMP2:%.*]] = ashr <16 x i40> [[TMP1]], <i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28, i40 28>
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <16 x i40> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <16 x i1> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <16 x i1> zeroinitializer, [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i1> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i1> [[TMP8]], i32 0
+; CHECK-NEXT: store i1 [[TMP10]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <16 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK: pred.store.if3:
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i1> [[TMP8]], i32 1
+; CHECK-NEXT: store i1 [[TMP12]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.continue4:
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP8]], i32 2
+; CHECK-NEXT: store i1 [[TMP14]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK: pred.store.if7:
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <16 x i1> [[TMP8]], i32 3
+; CHECK-NEXT: store i1 [[TMP16]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK: pred.store.continue8:
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i1> [[TMP0]], i32 4
+; CHECK-NEXT: br i1 [[TMP17]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK: pred.store.if9:
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i1> [[TMP8]], i32 4
+; CHECK-NEXT: store i1 [[TMP18]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK: pred.store.continue10:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP0]], i32 5
+; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; CHECK: pred.store.if11:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i1> [[TMP8]], i32 5
+; CHECK-NEXT: store i1 [[TMP20]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; CHECK: pred.store.continue12:
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i1> [[TMP0]], i32 6
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; CHECK: pred.store.if13:
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i1> [[TMP8]], i32 6
+; CHECK-NEXT: store i1 [[TMP22]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
+; CHECK: pred.store.continue14:
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i1> [[TMP0]], i32 7
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16:%.*]]
+; CHECK: pred.store.if15:
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP8]], i32 7
+; CHECK-NEXT: store i1 [[TMP24]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]]
+; CHECK: pred.store.continue16:
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i1> [[TMP0]], i32 8
+; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
+; CHECK: pred.store.if17:
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <16 x i1> [[TMP8]], i32 8
+; CHECK-NEXT: store i1 [[TMP26]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]]
+; CHECK: pred.store.continue18:
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i1> [[TMP0]], i32 9
+; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
+; CHECK: pred.store.if19:
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP8]], i32 9
+; CHECK-NEXT: store i1 [[TMP28]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE20]]
+; CHECK: pred.store.continue20:
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP0]], i32 10
+; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]]
+; CHECK: pred.store.if21:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i1> [[TMP8]], i32 10
+; CHECK-NEXT: store i1 [[TMP30]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE22]]
+; CHECK: pred.store.continue22:
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i1> [[TMP0]], i32 11
+; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
+; CHECK: pred.store.if23:
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i1> [[TMP8]], i32 11
+; CHECK-NEXT: store i1 [[TMP32]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]]
+; CHECK: pred.store.continue24:
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <16 x i1> [[TMP0]], i32 12
+; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
+; CHECK: pred.store.if25:
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP8]], i32 12
+; CHECK-NEXT: store i1 [[TMP34]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]]
+; CHECK: pred.store.continue26:
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i1> [[TMP0]], i32 13
+; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
+; CHECK: pred.store.if27:
+; CHECK-NEXT: [[TMP36:%.*]] = extractelement <16 x i1> [[TMP8]], i32 13
+; CHECK-NEXT: store i1 [[TMP36]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]]
+; CHECK: pred.store.continue28:
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <16 x i1> [[TMP0]], i32 14
+; CHECK-NEXT: br i1 [[TMP37]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK: pred.store.if29:
+; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i1> [[TMP8]], i32 14
+; CHECK-NEXT: store i1 [[TMP38]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; CHECK: pred.store.continue30:
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i1> [[TMP0]], i32 15
+; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32]]
+; CHECK: pred.store.if31:
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP8]], i32 15
+; CHECK-NEXT: store i1 [[TMP40]], ptr [[P]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]]
+; CHECK: pred.store.continue32:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[SHL:%.*]] = shl i40 [[A]], 24
+; CHECK-NEXT: [[ASHR:%.*]] = ashr i40 [[SHL]], 28
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i40 [[ASHR]] to i32
+; CHECK-NEXT: [[ICMP_EQ:%.*]] = icmp eq i32 [[TRUNC]], 0
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[ICMP_EQ]] to i32
+; CHECK-NEXT: [[ICMP_ULT:%.*]] = icmp ult i32 0, [[ZEXT]]
+; CHECK-NEXT: [[OR:%.*]] = or i1 [[ICMP_ULT]], true
+; CHECK-NEXT: [[ICMP_SGT:%.*]] = icmp sgt i1 [[OR]], false
+; CHECK-NEXT: store i1 [[ICMP_SGT]], ptr [[P]], align 1
+; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[IV_NEXT]], 10
+; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %for.body ]
+ %shl = shl i40 %a, 24
+ %ashr = ashr i40 %shl, 28
+ %trunc = trunc i40 %ashr to i32
+ %icmp.eq = icmp eq i32 %trunc, 0
+ %zext = zext i1 %icmp.eq to i32
+ %icmp.ult = icmp ult i32 0, %zext
+ %or = or i1 %icmp.ult, true
+ %icmp.sgt = icmp sgt i1 %or, false
+ store i1 %icmp.sgt, ptr %p, align 1
+ %iv.next = add i32 %iv, 1
+ %cond = icmp ult i32 %iv.next, 10
+ br i1 %cond, label %for.body, label %exit
+
+exit: ; preds = %for.body
+ ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
index 0b16d80a4adb..3d7153e66fc6 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
@@ -88,7 +88,7 @@ loopexit:
ret void
}
-attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
+attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
!0 = !{i32 0, i32 2147483646}
!1 = !{}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
index 5c9fe54b5521..743ca20f92b4 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll
@@ -118,7 +118,7 @@ L44: ; preds = %L26
ret ptr addrspace(10) null
}
-attributes #0 = { "target-cpu"="skylake-avx512" "target-features"="+xsaves,+xsavec,+prfchw,+lzcnt,+sahf,+pku,+avx512vl,+avx512bw,+avx512cd,+clwb,+clflushopt,+adx,+avx512dq,+avx512f,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+aes,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-rdrnd,-rtm,-rdseed,-avx512ifma,-avx512pf,-avx512er,-sha,-prefetchwt1,-avx512vbmi,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-amx-tile,-amx-int8,-sse4a,-xop,-lwp,-fma4,-tbm,-mwaitx,-xsaveopt,-clzero,-wbnoinvd,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8" }
+attributes #0 = { "target-cpu"="skylake-avx512" "target-features"="+xsaves,+xsavec,+prfchw,+lzcnt,+sahf,+pku,+avx512vl,+avx512bw,+avx512cd,+clwb,+clflushopt,+adx,+avx512dq,+avx512f,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+aes,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-rdrnd,-rtm,-rdseed,-avx512ifma,-avx512pf,-sha,-avx512vbmi,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-amx-tile,-amx-int8,-sse4a,-xop,-lwp,-fma4,-tbm,-mwaitx,-xsaveopt,-clzero,-wbnoinvd,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8" }
attributes #1 = { inaccessiblemem_or_argmemonly }
attributes #2 = { allocsize(1) }
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index bf2b9e2aef85..ce460f4fe354 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -111,4 +111,4 @@ for.body: ; preds = %for.body.preheader,
br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit99
}
-attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { norecurse nounwind ssp uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+evex512,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-vzeroupper" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll
index b49c9a139cfc..985c381ad42f 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/tailcall-nonunique.ll
@@ -9,10 +9,7 @@
; RUN: -stats -debug %s -S 2>&1 | FileCheck %s --check-prefix=STATS \
; RUN: --check-prefix=IR --check-prefix=DEBUG
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
-; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called _Z3foob (found multiple possible chains)
+; DEBUG: Not found through unique tail call chain: _Z3barv from main that actually called xyz (found multiple possible chains)
;; Check that all calls in the IR are to the original functions, leading to a
;; non-cold operator new call.
@@ -91,39 +88,37 @@ return: ; preds = %if.else, %if.then
}
; Function Attrs: noinline
-; IR-LABEL: @main()
-define dso_local i32 @main() local_unnamed_addr #0 {
-delete.end13:
+; IR-LABEL: @xyz()
+define dso_local i32 @xyz() local_unnamed_addr #0 {
; IR: call ptr @_Z3foob(i1 true)
- %call = tail call ptr @_Z3foob(i1 true), !callsite !10
+ %call = tail call ptr @_Z3foob(i1 true)
; IR: call ptr @_Z3foob(i1 true)
- %call1 = tail call ptr @_Z3foob(i1 true), !callsite !11
+ %call1 = tail call ptr @_Z3foob(i1 true)
; IR: call ptr @_Z3foob(i1 false)
- %call2 = tail call ptr @_Z3foob(i1 false), !callsite !12
+ %call2 = tail call ptr @_Z3foob(i1 false)
; IR: call ptr @_Z3foob(i1 false)
- %call3 = tail call ptr @_Z3foob(i1 false), !callsite !13
+ %call3 = tail call ptr @_Z3foob(i1 false)
+ ret i32 0
+}
+
+define dso_local i32 @main() local_unnamed_addr #0 {
+ ; IR: call i32 @xyz()
+ %call1 = tail call i32 @xyz(), !callsite !11
ret i32 0
}
; IR: attributes #[[NOTCOLD]] = { builtin allocsize(0) "memprof"="notcold" }
-; STATS: 4 memprof-context-disambiguation - Number of profiled callees found via multiple tail call chains
+; STATS: 1 memprof-context-disambiguation - Number of profiled callees found via multiple tail call chains
attributes #0 = { noinline }
attributes #1 = { nobuiltin allocsize(0) }
attributes #2 = { builtin allocsize(0) }
-!0 = !{!1, !3, !5, !7}
-!1 = !{!2, !"notcold"}
-!2 = !{i64 3186456655321080972, i64 6307901912192269588}
-!3 = !{!4, !"cold"}
-!4 = !{i64 3186456655321080972, i64 6792096022461663180}
+!0 = !{!5, !7}
!5 = !{!6, !"notcold"}
!6 = !{i64 3186456655321080972, i64 8632435727821051414}
!7 = !{!8, !"cold"}
!8 = !{i64 3186456655321080972, i64 -3421689549917153178}
!9 = !{i64 3186456655321080972}
-!10 = !{i64 8632435727821051414}
!11 = !{i64 -3421689549917153178}
-!12 = !{i64 6307901912192269588}
-!13 = !{i64 6792096022461663180}
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
index 34a68a3020e5..e6ddf16f0676 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -8,7 +8,7 @@
;; void p1(void);
;; int unknown(void);
;; void unknown_pure(void) __attribute__((pure));
-;; void unknown_no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; [[omp::assume("omp_no_openmp")]] void unknown_no_openmp(void);
;;
;; int G;
;; void no_parallel_region_in_here(void) {
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
index 85d495f45039..d20821d45036 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll
@@ -10,7 +10,7 @@
;; void p1(void);
;; int unknown(void);
;; void unknown_pure(void) __attribute__((pure));
-;; void unknown_no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; [[omp::assume("omp_no_openmp")]] void unknown_no_openmp(void);
;;
;; int G;
;; void no_parallel_region_in_here(void) {
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
index f8c4e6b113c9..f7bfd3065069 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
@@ -1,10 +1,10 @@
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
target triple = "nvptx64"
-; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback.
-; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:20:1: Rewriting generic-mode kernel with a customized state machine.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:13:5: Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:15:5: Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c:20:1: Rewriting generic-mode kernel with a customized state machine.
;; void unknown(void);
@@ -24,7 +24,7 @@ target triple = "nvptx64"
;; }
;; }
;;
-;; void no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; [[omp::assume("omp_no_openmp")]] void no_openmp(void);
;; void test_no_fallback(void) {
;; #pragma omp target teams
;; {
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index 159280ae62a0..393968913855 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -7,7 +7,7 @@
; RUN: opt --mtriple=nvptx64-- -S -passes=openmp-opt-postlink < %s | FileCheck %s --check-prefix=NVPTX-DISABLED2
;; void unknown(void);
-;; void spmd_amenable(void) __attribute__((assume("ompx_spmd_amenable")));
+;; [[omp::assume("ompx_spmd_amenable")]] void spmd_amenable(void);
;;
;; void sequential_loop() {
;; #pragma omp target teams
@@ -22,7 +22,7 @@
;; }
;; }
;;
-;; void use(__attribute__((noescape)) int *) __attribute__((assume("ompx_spmd_amenable")));
+;; [[omp::assume("ompx_spmd_amenable")]] void use(__attribute__((noescape)) int *);
;;
;; void sequential_loop_to_stack_var() {
;; #pragma omp target teams
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
index b2e14dce94d5..bd128b7f74d7 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll
@@ -2,8 +2,8 @@
; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
; RUN: opt -S -passes=openmp-opt -openmp-opt-disable-spmdization < %s | FileCheck %s --check-prefix=CHECK-DISABLED
;
-; void pure(void) __attribute__((pure, assume("ompx_spmd_amenable")));
-; int no_openmp(int *) __attribute__((assume("omp_no_openmp","ompx_spmd_amenable")));
+; [[omp::assume("ompx_spmd_amenable")]] void pure(void) __attribute__((pure));
+; [[omp::assume("omp_no_openmp","ompx_spmd_amenable")]] int no_openmp(int *);
;
; void sequential_loop(int *x, int N) {
; #pragma omp target teams
diff --git a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll
index 28df2f524913..f5a4cea9a841 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll
@@ -1,12 +1,12 @@
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
target triple = "nvptx64"
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override.
-; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:20:1: Transformed generic-mode kernel to SPMD-mode.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override.
+; CHECK{LITERAL}: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:20:1: Transformed generic-mode kernel to SPMD-mode.
;; void unknown(void);
@@ -26,7 +26,7 @@ target triple = "nvptx64"
;; }
;; }
;;
-;; void no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; void no_openmp(void) [[omp::assume("omp_no_openmp")]];
;; void test_no_fallback(void) {
;; #pragma omp target teams
;; {
diff --git a/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll b/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
index c24c554102dd..91efbcc4ee38 100644
--- a/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
+++ b/llvm/test/Transforms/SCCP/ip-add-range-to-call.ll
@@ -159,7 +159,7 @@ exit:
}
define i32 @caller5() {
-; CHECK-LABEL: define range(i32 200, 401) i32 @caller5() {
+; CHECK-LABEL: define i32 @caller5() {
; CHECK-NEXT: [[C1:%.*]] = call i32 @callee5(i32 10, i32 100)
; CHECK-NEXT: [[C2:%.*]] = call i32 @callee5(i32 20, i32 200)
; CHECK-NEXT: [[A:%.*]] = add i32 [[C1]], [[C2]]
diff --git a/llvm/test/Transforms/SCCP/range-mul-nuw-nsw-flags.ll b/llvm/test/Transforms/SCCP/range-mul-nuw-nsw-flags.ll
new file mode 100644
index 000000000000..8525264a0087
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/range-mul-nuw-nsw-flags.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=ipsccp -S %s | FileCheck %s
+
+define i1 @range_from_mul_nuw_nsw(i32 %a) {
+; CHECK-LABEL: @range_from_mul_nuw_nsw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[THEN:%.*]], label [[ELSE:%.*]]
+; CHECK: then:
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[A]], 10000
+; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], -5000
+; CHECK-NEXT: ret i1 false
+; CHECK: else:
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %cmp = icmp ne i32 %a, 0
+ br i1 %cmp, label %then, label %else
+then:
+ %mul = mul nuw nsw i32 %a, 10000 ; Refined range via mul_nuw: [10000, 0)
+ %add = add nsw i32 %mul, -5000 ; Range: [5000, UINT_MAX - 5000 + 1)
+ %cond = icmp ult i32 %add, 4999
+ ret i1 %cond
+else:
+ ret i1 0
+}
diff --git a/llvm/test/Transforms/SCCP/range-with-undef.ll b/llvm/test/Transforms/SCCP/range-with-undef.ll
new file mode 100644
index 000000000000..9b8d41517114
--- /dev/null
+++ b/llvm/test/Transforms/SCCP/range-with-undef.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=ipsccp < %s | FileCheck %s
+
+; Make sure that constant ranges including undef are propagated correctly.
+
+define i8 @test_binop(i1 %cond, i8 %a) {
+; CHECK-LABEL: define i8 @test_binop(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ]
+; CHECK-NEXT: [[AND:%.*]] = and i16 [[PHI]], -1
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i16 [[AND]] to i8
+; CHECK-NEXT: ret i8 [[TRUNC]]
+;
+entry:
+ br i1 %cond, label %if, label %join
+
+if:
+ %a.ext = zext i8 %a to i16
+ br label %join
+
+join:
+ %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ]
+ %and = and i16 %phi, u0x0000ffff
+ %trunc = trunc i16 %and to i8
+ ret i8 %trunc
+}
+
+define i8 @test_cast(i1 %cond, i8 %a) {
+; CHECK-LABEL: define i8 @test_cast(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ]
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[PHI]] to i32
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ZEXT]] to i8
+; CHECK-NEXT: ret i8 [[TRUNC]]
+;
+entry:
+ br i1 %cond, label %if, label %join
+
+if:
+ %a.ext = zext i8 %a to i16
+ br label %join
+
+join:
+ %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ]
+ %zext = zext i16 %phi to i32
+ %trunc = trunc i32 %zext to i8
+ ret i8 %trunc
+}
+
+define i8 @test_intrin(i1 %cond, i8 %a) {
+; CHECK-LABEL: define i8 @test_intrin(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ]
+; CHECK-NEXT: [[UMAX:%.*]] = call i16 @llvm.umax.i16(i16 [[PHI]], i16 42)
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i16 [[UMAX]] to i8
+; CHECK-NEXT: ret i8 [[TRUNC]]
+;
+entry:
+ br i1 %cond, label %if, label %join
+
+if:
+ %a.ext = zext i8 %a to i16
+ br label %join
+
+join:
+ %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ]
+ %umax = call i16 @llvm.umax(i16 %phi, i16 42)
+ %trunc = trunc i16 %umax to i8
+ ret i8 %trunc
+}
+
+define i9 @test_with_overflow(i1 %cond, i8 %a) {
+; CHECK-LABEL: define i9 @test_with_overflow(
+; CHECK-SAME: i1 [[COND:%.*]], i8 [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br i1 [[COND]], label %[[IF:.*]], label %[[JOIN:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: [[A_EXT:%.*]] = zext i8 [[A]] to i16
+; CHECK-NEXT: br label %[[JOIN]]
+; CHECK: [[JOIN]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i16 [ undef, %[[ENTRY]] ], [ [[A_EXT]], %[[IF]] ]
+; CHECK-NEXT: [[WO:%.*]] = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 [[PHI]], i16 1)
+; CHECK-NEXT: [[ADD:%.*]] = extractvalue { i16, i1 } [[WO]], 0
+; CHECK-NEXT: [[TRUNC:%.*]] = trunc i16 [[ADD]] to i9
+; CHECK-NEXT: ret i9 [[TRUNC]]
+;
+entry:
+ br i1 %cond, label %if, label %join
+
+if:
+ %a.ext = zext i8 %a to i16
+ br label %join
+
+join:
+ %phi = phi i16 [ undef, %entry ], [ %a.ext, %if ]
+ %wo = call {i16, i1} @llvm.uadd.with.overflow(i16 %phi, i16 1)
+ %add = extractvalue {i16, i1} %wo, 0
+ %trunc = trunc i16 %add to i9
+ ret i9 %trunc
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
index 290560151b79..3749bdf1bba3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX7-LABEL: @uadd_sat_v2i16(
@@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @uadd_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @usub_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @sadd_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @ssub_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
+; GFX9-LABEL: @uadd_sat_v3i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
+; GFX9-NEXT: ret <3 x i16> [[INS_2]]
+;
bb:
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
@@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
-; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
-; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
-; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
-; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
-; GFX8-NEXT: ret <4 x i16> [[INS_3]]
+; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT: ret <4 x i16> [[INS_31]]
+;
+; GFX9-LABEL: @uadd_sat_v4i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
%arg0.0 = extractelement <4 x i16> %arg0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
index 2038400a0586..0bb641371825 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer,instcombine %s | FileCheck -check-prefixes=GCN,GFX9 %s
define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX7-LABEL: @uadd_sat_v2i16(
@@ -21,6 +21,11 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @uadd_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -51,6 +56,11 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @usub_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -81,6 +91,11 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @sadd_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -111,6 +126,11 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) {
; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
; GFX8-NEXT: ret <2 x i16> [[TMP0]]
;
+; GFX9-LABEL: @ssub_sat_v2i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]])
+; GFX9-NEXT: ret <2 x i16> [[TMP0]]
+;
bb:
%arg0.0 = extractelement <2 x i16> %arg0, i64 0
%arg0.1 = extractelement <2 x i16> %arg0, i64 1
@@ -252,6 +272,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
+; GFX9-LABEL: @uadd_sat_v3i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
+; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
+; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
+; GFX9-NEXT: ret <3 x i16> [[INS_2]]
+;
bb:
%arg0.0 = extractelement <3 x i16> %arg0, i64 0
%arg0.1 = extractelement <3 x i16> %arg0, i64 1
@@ -291,19 +323,25 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
-; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 2
-; GFX8-NEXT: [[ARG0_3:%.*]] = extractelement <4 x i16> [[ARG0]], i64 3
-; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 2
-; GFX8-NEXT: [[ARG1_3:%.*]] = extractelement <4 x i16> [[ARG1]], i64 3
-; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
-; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; GFX8-NEXT: [[ADD_3:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_3]], i16 [[ARG1_3]])
-; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; GFX8-NEXT: [[INS_2:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
-; GFX8-NEXT: [[INS_3:%.*]] = insertelement <4 x i16> [[INS_2]], i16 [[ADD_3]], i64 3
-; GFX8-NEXT: ret <4 x i16> [[INS_3]]
+; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX8-NEXT: ret <4 x i16> [[INS_31]]
+;
+; GFX9-LABEL: @uadd_sat_v4i16(
+; GFX9-NEXT: bb:
+; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
+; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
+; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
+; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
%arg0.0 = extractelement <4 x i16> %arg0, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
index 0a020c855cc2..e2d25bae95e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/crash_extract_subvector_cost.ll
@@ -4,15 +4,10 @@
define <2 x i16> @uadd_sat_v9i16_combine_vi16(<9 x i16> %arg0, <9 x i16> %arg1) {
; CHECK-LABEL: @uadd_sat_v9i16_combine_vi16(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG0_1:%.*]] = extractelement <9 x i16> undef, i64 7
-; CHECK-NEXT: [[ARG0_2:%.*]] = extractelement <9 x i16> [[ARG0:%.*]], i64 8
-; CHECK-NEXT: [[ARG1_1:%.*]] = extractelement <9 x i16> [[ARG1:%.*]], i64 7
-; CHECK-NEXT: [[ARG1_2:%.*]] = extractelement <9 x i16> [[ARG1]], i64 8
-; CHECK-NEXT: [[ADD_1:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_1]], i16 [[ARG1_1]])
-; CHECK-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
-; CHECK-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> undef, i16 [[ADD_1]], i64 0
-; CHECK-NEXT: [[INS_2:%.*]] = insertelement <2 x i16> [[INS_1]], i16 [[ADD_2]], i64 1
-; CHECK-NEXT: ret <2 x i16> [[INS_2]]
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <9 x i16> undef, <9 x i16> [[ARG0:%.*]], <2 x i32> <i32 0, i32 17>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <9 x i16> [[ARG1:%.*]], <9 x i16> poison, <2 x i32> <i32 7, i32 8>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
+; CHECK-NEXT: ret <2 x i16> [[TMP2]]
;
bb:
%arg0.1 = extractelement <9 x i16> undef, i64 7
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
index 46980b33e401..3b63c1e35610 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/phi-result-use-order.ll
@@ -4,23 +4,20 @@
define <4 x half> @phis(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) {
; CHECK-LABEL: @phis(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 2
-; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
; CHECK: bb0:
-; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 2
-; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
-; CHECK-NEXT: [[C2:%.*]] = phi half [ [[A2]], [[ENTRY:%.*]] ], [ [[B2]], [[BB0]] ]
-; CHECK-NEXT: [[C3:%.*]] = phi half [ [[A3]], [[ENTRY]] ], [ [[B3]], [[BB0]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[BB0]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[O2:%.*]] = insertelement <4 x half> [[TMP3]], half [[C2]], i64 2
-; CHECK-NEXT: [[O3:%.*]] = insertelement <4 x half> [[O2]], half [[C3]], i64 3
-; CHECK-NEXT: ret <4 x half> [[O3]]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x half> [[TMP8]]
;
entry:
%a0 = extractelement <4 x half> %in1, i64 0
@@ -52,23 +49,20 @@ bb1:
define <4 x half> @phis_reverse(i1 %cmp1, <4 x half> %in1, <4 x half> %in2) {
; CHECK-LABEL: @phis_reverse(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x half> [[IN1:%.*]], i64 2
-; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x half> [[IN1]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x half> [[IN1:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN1]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[BB0:%.*]]
; CHECK: bb0:
-; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x half> [[IN2:%.*]], i64 2
-; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x half> [[IN2]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x half> [[IN2:%.*]], <4 x half> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x half> [[IN2]], <4 x half> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
-; CHECK-NEXT: [[C3:%.*]] = phi half [ [[A3]], [[ENTRY:%.*]] ], [ [[B3]], [[BB0]] ]
-; CHECK-NEXT: [[C2:%.*]] = phi half [ [[A2]], [[ENTRY]] ], [ [[B2]], [[BB0]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[BB0]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[O2:%.*]] = insertelement <4 x half> [[TMP3]], half [[C2]], i64 2
-; CHECK-NEXT: [[O3:%.*]] = insertelement <4 x half> [[O2]], half [[C3]], i64 3
-; CHECK-NEXT: ret <4 x half> [[O3]]
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x half> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP2]], [[BB0]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x half> [ [[TMP1]], [[ENTRY]] ], [ [[TMP3]], [[BB0]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x half> [[TMP5]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x half> [[TMP4]], <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x half> [[TMP8]]
;
entry:
%a0 = extractelement <4 x half> %in1, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
index b34b9a352536..aceee8840bb4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -3,21 +3,10 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer,dce < %s | FileCheck -check-prefixes=GCN,VI %s
define half @reduction_half4(<4 x half> %a) {
-; GFX9-LABEL: @reduction_half4(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]])
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_half4(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x half> [[A]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x half> [[A]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x half> [[A]], i64 3
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: ret half [[ADD3]]
+; GCN-LABEL: @reduction_half4(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]])
+; GCN-NEXT: ret half [[TMP0]]
;
entry:
%elt0 = extractelement <4 x half> %a, i64 0
@@ -33,29 +22,10 @@ entry:
}
define half @reduction_half8(<8 x half> %vec8) {
-; GFX9-LABEL: @reduction_half8(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]])
-; GFX9-NEXT: ret half [[TMP0]]
-;
-; VI-LABEL: @reduction_half8(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x half> [[VEC8:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x half> [[VEC8]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x half> [[VEC8]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x half> [[VEC8]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
-; VI-NEXT: ret half [[ADD7]]
+; GCN-LABEL: @reduction_half8(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]])
+; GCN-NEXT: ret half [[TMP0]]
;
entry:
%elt0 = extractelement <8 x half> %vec8, i64 0
@@ -86,38 +56,12 @@ define half @reduction_half16(<16 x half> %vec16) {
;
; VI-LABEL: @reduction_half16(
; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <16 x half> [[VEC16:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <16 x half> [[VEC16]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <16 x half> [[VEC16]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <16 x half> [[VEC16]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
-; VI-NEXT: [[ELT8:%.*]] = extractelement <16 x half> [[VEC16]], i64 8
-; VI-NEXT: [[ELT9:%.*]] = extractelement <16 x half> [[VEC16]], i64 9
-; VI-NEXT: [[ELT10:%.*]] = extractelement <16 x half> [[VEC16]], i64 10
-; VI-NEXT: [[ELT11:%.*]] = extractelement <16 x half> [[VEC16]], i64 11
-; VI-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
-; VI-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
-; VI-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
-; VI-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
-; VI-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = fadd fast half [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = fadd fast half [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = fadd fast half [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = fadd fast half [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = fadd fast half [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = fadd fast half [[ELT7]], [[ADD6]]
-; VI-NEXT: [[ADD8:%.*]] = fadd fast half [[ELT8]], [[ADD7]]
-; VI-NEXT: [[ADD9:%.*]] = fadd fast half [[ELT9]], [[ADD8]]
-; VI-NEXT: [[ADD10:%.*]] = fadd fast half [[ELT10]], [[ADD9]]
-; VI-NEXT: [[ADD11:%.*]] = fadd fast half [[ELT11]], [[ADD10]]
-; VI-NEXT: [[ADD12:%.*]] = fadd fast half [[ELT12]], [[ADD11]]
-; VI-NEXT: [[ADD13:%.*]] = fadd fast half [[ELT13]], [[ADD12]]
-; VI-NEXT: [[ADD14:%.*]] = fadd fast half [[ELT14]], [[ADD13]]
-; VI-NEXT: [[ADD15:%.*]] = fadd fast half [[ELT15]], [[ADD14]]
-; VI-NEXT: ret half [[ADD15]]
+; VI-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; VI-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP0]])
+; VI-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; VI-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP2]])
+; VI-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
+; VI-NEXT: ret half [[OP_RDX]]
;
entry:
%elt0 = extractelement <16 x half> %vec16, i64 0
@@ -183,21 +127,10 @@ entry:
}
define i16 @reduction_v4i16(<4 x i16> %a) {
-; GFX9-LABEL: @reduction_v4i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]])
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_v4i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <4 x i16> [[A]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <4 x i16> [[A]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <4 x i16> [[A]], i64 3
-; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
-; VI-NEXT: ret i16 [[ADD3]]
+; GCN-LABEL: @reduction_v4i16(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]])
+; GCN-NEXT: ret i16 [[TMP0]]
;
entry:
%elt0 = extractelement <4 x i16> %a, i64 0
@@ -213,29 +146,10 @@ entry:
}
define i16 @reduction_v8i16(<8 x i16> %vec8) {
-; GFX9-LABEL: @reduction_v8i16(
-; GFX9-NEXT: entry:
-; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]])
-; GFX9-NEXT: ret i16 [[TMP0]]
-;
-; VI-LABEL: @reduction_v8i16(
-; VI-NEXT: entry:
-; VI-NEXT: [[ELT0:%.*]] = extractelement <8 x i16> [[VEC8:%.*]], i64 0
-; VI-NEXT: [[ELT1:%.*]] = extractelement <8 x i16> [[VEC8]], i64 1
-; VI-NEXT: [[ELT2:%.*]] = extractelement <8 x i16> [[VEC8]], i64 2
-; VI-NEXT: [[ELT3:%.*]] = extractelement <8 x i16> [[VEC8]], i64 3
-; VI-NEXT: [[ELT4:%.*]] = extractelement <8 x i16> [[VEC8]], i64 4
-; VI-NEXT: [[ELT5:%.*]] = extractelement <8 x i16> [[VEC8]], i64 5
-; VI-NEXT: [[ELT6:%.*]] = extractelement <8 x i16> [[VEC8]], i64 6
-; VI-NEXT: [[ELT7:%.*]] = extractelement <8 x i16> [[VEC8]], i64 7
-; VI-NEXT: [[ADD1:%.*]] = add i16 [[ELT1]], [[ELT0]]
-; VI-NEXT: [[ADD2:%.*]] = add i16 [[ELT2]], [[ADD1]]
-; VI-NEXT: [[ADD3:%.*]] = add i16 [[ELT3]], [[ADD2]]
-; VI-NEXT: [[ADD4:%.*]] = add i16 [[ELT4]], [[ADD3]]
-; VI-NEXT: [[ADD5:%.*]] = add i16 [[ELT5]], [[ADD4]]
-; VI-NEXT: [[ADD6:%.*]] = add i16 [[ELT6]], [[ADD5]]
-; VI-NEXT: [[ADD7:%.*]] = add i16 [[ELT7]], [[ADD6]]
-; VI-NEXT: ret i16 [[ADD7]]
+; GCN-LABEL: @reduction_v8i16(
+; GCN-NEXT: entry:
+; GCN-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]])
+; GCN-NEXT: ret i16 [[TMP0]]
;
entry:
%elt0 = extractelement <8 x i16> %vec8, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 059e4c38b519..9608608a1809 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -155,13 +155,11 @@ define <4 x float> @exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -173,13 +171,11 @@ define <4 x float> @exp_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -212,13 +208,11 @@ define <4 x float> @int_exp_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_exp_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -230,13 +224,11 @@ define <4 x float> @int_exp_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -269,13 +261,11 @@ define <4 x float> @log_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -287,13 +277,11 @@ define <4 x float> @log_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -326,13 +314,11 @@ define <4 x float> @int_log_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_log_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -344,13 +330,11 @@ define <4 x float> @int_log_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -383,13 +367,11 @@ define <4 x float> @sin_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -401,13 +383,11 @@ define <4 x float> @sin_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
@@ -440,13 +420,11 @@ define <4 x float> @int_sin_4x(ptr %a) {
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[VECINS_31]]
;
; DEFAULT-LABEL: define <4 x float> @int_sin_4x
; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -458,13 +436,11 @@ define <4 x float> @int_sin_4x(ptr %a) {
; DEFAULT-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; DEFAULT-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; DEFAULT-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; DEFAULT-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; DEFAULT-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; DEFAULT-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; DEFAULT-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; DEFAULT-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
-; DEFAULT-NEXT: ret <4 x float> [[VECINS_3]]
+; DEFAULT-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
+; DEFAULT-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
+; DEFAULT-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; DEFAULT-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; DEFAULT-NEXT: ret <4 x float> [[VECINS_31]]
;
entry:
%0 = load <4 x float>, ptr %a, align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
new file mode 100644
index 000000000000..2daa3b58e5c3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v -slp-threshold=-11 < %s | FileCheck %s
+
+define <4 x i32> @test(<2 x i64> %v, ptr %p) {
+; CHECK-LABEL: define <4 x i32> @test(
+; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x i32> [[TMP5]]
+;
+entry:
+ %0 = extractelement <2 x i64> %v, i32 1
+ %arrayidx127.2 = getelementptr i16, ptr %p, i64 %0
+ %1 = load i16, ptr %arrayidx127.2, align 2
+ %conv128.2 = zext i16 %1 to i32
+ %2 = extractelement <2 x i64> %v, i32 0
+ %arrayidx127.3 = getelementptr i16, ptr %p, i64 %2
+ %3 = load i16, ptr %arrayidx127.3, align 2
+ %conv128.3 = zext i16 %3 to i32
+ %4 = insertelement <4 x i32> zeroinitializer, i32 %conv128.2, i32 0
+ %5 = insertelement <4 x i32> %4, i32 %conv128.3, i32 1
+ ret <4 x i32> %5
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
index 6c21cc1cfc5b..45ce1eec2cbf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll
@@ -51,25 +51,23 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
;
; AVX-LABEL: @ceil_floor(
; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
-; AVX-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; AVX-NEXT: ret <8 x float> [[R71]]
;
; AVX2-LABEL: @ceil_floor(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
index bc5bcee36116..b8b284b9595a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll
@@ -51,25 +51,23 @@ define <8 x float> @ceil_floor(<8 x float> %a) {
;
; AVX-LABEL: @ceil_floor(
; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0
-; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i64 1
-; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i64 2
; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3
; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]])
-; AVX-NEXT: [[AB1:%.*]] = call float @llvm.floor.f32(float [[A1]])
-; AVX-NEXT: [[AB2:%.*]] = call float @llvm.floor.f32(float [[A2]])
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]])
; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]])
-; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
-; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]])
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
-; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5>
+; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]])
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7>
+; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]])
; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0
-; AVX-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i64 1
-; AVX-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i64 2
-; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i64 3
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
-; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i64 3
+; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
; AVX-NEXT: ret <8 x float> [[R71]]
;
; AVX2-LABEL: @ceil_floor(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
index 4f35b77c50be..8701551f46ab 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -39,9 +39,10 @@ define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: @h_undef(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 poison, i32 3, i32 5, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i8> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: ret <4 x i8> [[TMP2]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> <i8 undef, i8 poison, i8 poison, i8 poison>, <4 x i32> <i32 4, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[Y:%.*]], <4 x i32> <i32 0, i32 1, i32 5, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[TMP2]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i8> [[TMP3]]
;
%x0 = extractelement <4 x i8> undef, i32 0
%x3 = extractelement <4 x i8> %x, i32 3
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
index 4a9f717918a0..b85ec5bce819 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
;
; 128-bit vectors
@@ -213,62 +213,16 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
; PR50392
define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: @test_v4f64_partial_swizzle(
-; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; SSE-NEXT: ret <4 x double> [[R03]]
-;
-; SLM-LABEL: @test_v4f64_partial_swizzle(
-; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; SLM-NEXT: ret <4 x double> [[R03]]
-;
-; AVX1-LABEL: @test_v4f64_partial_swizzle(
-; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; AVX1-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX1-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX1-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; AVX1-NEXT: ret <4 x double> [[R031]]
-;
-; AVX2-LABEL: @test_v4f64_partial_swizzle(
-; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; AVX2-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; AVX2-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; AVX2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0
-; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX2-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
-; AVX2-NEXT: ret <4 x double> [[R031]]
-;
-; AVX512-LABEL: @test_v4f64_partial_swizzle(
-; AVX512-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; AVX512-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX512-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
-; AVX512-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; AVX512-NEXT: ret <4 x double> [[R03]]
+; CHECK-LABEL: @test_v4f64_partial_swizzle(
+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
+; CHECK-NEXT: ret <4 x double> [[R03]]
;
%a0 = extractelement <4 x double> %a, i64 0
%a1 = extractelement <4 x double> %a, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
index 31e3e6aa0a83..e30f84e4f17b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX
;
; 128-bit vectors
@@ -213,62 +213,16 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b) {
; PR50392
define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b) {
-; SSE-LABEL: @test_v4f64_partial_swizzle(
-; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SSE-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double undef, double poison>, <4 x i32> <i32 0, i32 2, i32 1, i32 poison>
-; SSE-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; SSE-NEXT: ret <4 x double> [[R03]]
-;
-; SLM-LABEL: @test_v4f64_partial_swizzle(
-; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; SLM-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double undef, double poison>, <4 x i32> <i32 0, i32 2, i32 1, i32 poison>
-; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; SLM-NEXT: ret <4 x double> [[R03]]
-;
-; AVX1-LABEL: @test_v4f64_partial_swizzle(
-; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; AVX1-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX1-NEXT: [[R00:%.*]] = insertelement <4 x double> <double poison, double undef, double poison, double poison>, double [[R0]], i64 0
-; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX1-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX1-NEXT: ret <4 x double> [[R031]]
-;
-; AVX2-LABEL: @test_v4f64_partial_swizzle(
-; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0
-; AVX2-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1
-; AVX2-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]]
-; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> <i32 1, i32 2>
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> <i32 0, i32 3>
-; AVX2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT: [[R00:%.*]] = insertelement <4 x double> <double poison, double undef, double poison, double poison>, double [[R0]], i64 0
-; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; AVX2-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; AVX2-NEXT: ret <4 x double> [[R031]]
-;
-; AVX512-LABEL: @test_v4f64_partial_swizzle(
-; AVX512-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; AVX512-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
-; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
-; AVX512-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
-; AVX512-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double undef, double poison>, <4 x i32> <i32 0, i32 2, i32 1, i32 poison>
-; AVX512-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
-; AVX512-NEXT: ret <4 x double> [[R03]]
+; CHECK-LABEL: @test_v4f64_partial_swizzle(
+; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double undef, double poison>, <4 x i32> <i32 0, i32 2, i32 1, i32 poison>
+; CHECK-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3
+; CHECK-NEXT: ret <4 x double> [[R03]]
;
%a0 = extractelement <4 x double> %a, i64 0
%a1 = extractelement <4 x double> %a, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll
index 53f17083bd4b..1d6e191c6f97 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll
@@ -4,10 +4,6 @@
define void @test() {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> zeroinitializer, i64 1
-; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une half [[TMP0]], 0xH0000
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x half> zeroinitializer, i64 1
-; CHECK-NEXT: [[TOBOOL3:%.*]] = fcmp une half [[TMP1]], 0xH0000
; CHECK-NEXT: ret void
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
index b8c551c7b771..9e8cdc62c729 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vector_gep.ll
@@ -26,5 +26,5 @@ entry:
unreachable
}
-attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noreturn readonly uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512f,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll
index ee67ab341117..b827fc63c0ef 100644
--- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/AArch64/streaming-compatible-expand-masked-gather-scatter.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: aarch64-registered-target
-; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible-sve | FileCheck %s
+; RUN: opt -S %s -passes=scalarize-masked-mem-intrin -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible | FileCheck %s
define <2 x i32> @scalarize_v2i32(<2 x ptr> %p, <2 x i1> %mask, <2 x i32> %passthru) {
; CHECK-LABEL: @scalarize_v2i32(
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll b/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll
new file mode 100644
index 000000000000..bead0dc4c567
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/switch-dead-default-lookup-table.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt %s -S -passes='simplifycfg<switch-to-lookup>' -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+define i64 @test_1(i64 %0) {
+; CHECK-LABEL: define i64 @test_1(
+; CHECK-SAME: i64 [[TMP0:%.*]]) {
+; CHECK-NEXT: switch.lookup:
+; CHECK-NEXT: [[TMP1:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i64], ptr @switch.table.test_1, i32 0, i64 [[TMP1]]
+; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i64, ptr [[SWITCH_GEP]], align 8
+; CHECK-NEXT: ret i64 [[SWITCH_LOAD]]
+;
+ %2 = urem i64 %0, 4
+ switch i64 %2, label %5 [
+ i64 1, label %3
+ i64 2, label %3
+ i64 3, label %4
+ ]
+
+3:
+ br label %5
+
+4:
+ br label %5
+
+5:
+ %.0 = phi i64 [ 2, %4 ], [ 1, %3 ], [ 0, %1 ]
+ ret i64 %.0
+}
+
+
+define i64 @test_2(i64 %0) {
+; CHECK-LABEL: define i64 @test_2(
+; CHECK-SAME: i64 [[TMP0:%.*]]) {
+; CHECK-NEXT: switch.lookup:
+; CHECK-NEXT: [[TMP1:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: ret i64 [[TMP1]]
+;
+ %2 = urem i64 %0, 4
+ switch i64 %2, label %6 [
+ i64 1, label %3
+ i64 2, label %4
+ i64 3, label %5
+ ]
+
+3:
+ br label %6
+
+4:
+ br label %6
+
+5:
+ br label %6
+
+6:
+ %.0 = phi i64 [ 0, %1 ], [ 1, %3 ], [ 2, %4 ], [ 3, %5 ]
+ ret i64 %.0
+}
+
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
index 7c0d5e4f2b65..4a457cc177e8 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-dead-default.ll
@@ -79,15 +79,15 @@ default:
ret void
}
-; This one is a negative test - we know the value of the default,
-; but that's about it
+; We can replace the default branch with case 3 since it is the only missing case.
define void @test3(i2 %a) {
; CHECK-LABEL: define void @test3(
; CHECK-SAME: i2 [[A:%.*]]) {
-; CHECK-NEXT: switch i2 [[A]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT: switch i2 [[A]], label [[DOTUNREACHABLEDEFAULT:%.*]] [
; CHECK-NEXT: i2 0, label [[CASE0:%.*]]
; CHECK-NEXT: i2 1, label [[CASE1:%.*]]
; CHECK-NEXT: i2 -2, label [[CASE2:%.*]]
+; CHECK-NEXT: i2 -1, label [[DEFAULT:%.*]]
; CHECK-NEXT: ]
; CHECK: common.ret:
; CHECK-NEXT: ret void
@@ -100,6 +100,8 @@ define void @test3(i2 %a) {
; CHECK: case2:
; CHECK-NEXT: call void @foo(i32 2)
; CHECK-NEXT: br label [[COMMON_RET]]
+; CHECK: .unreachabledefault:
+; CHECK-NEXT: unreachable
; CHECK: default:
; CHECK-NEXT: call void @foo(i32 3)
; CHECK-NEXT: br label [[COMMON_RET]]
@@ -122,6 +124,50 @@ default:
ret void
}
+define void @test3_prof(i2 %a) {
+; CHECK-LABEL: define void @test3_prof(
+; CHECK-SAME: i2 [[A:%.*]]) {
+; CHECK-NEXT: switch i2 [[A]], label [[DOTUNREACHABLEDEFAULT:%.*]] [
+; CHECK-NEXT: i2 0, label [[CASE0:%.*]]
+; CHECK-NEXT: i2 1, label [[CASE1:%.*]]
+; CHECK-NEXT: i2 -2, label [[CASE2:%.*]]
+; CHECK-NEXT: i2 -1, label [[DEFAULT:%.*]]
+; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]]
+; CHECK: common.ret:
+; CHECK-NEXT: ret void
+; CHECK: case0:
+; CHECK-NEXT: call void @foo(i32 0)
+; CHECK-NEXT: br label [[COMMON_RET:%.*]]
+; CHECK: case1:
+; CHECK-NEXT: call void @foo(i32 1)
+; CHECK-NEXT: br label [[COMMON_RET]]
+; CHECK: case2:
+; CHECK-NEXT: call void @foo(i32 2)
+; CHECK-NEXT: br label [[COMMON_RET]]
+; CHECK: .unreachabledefault:
+; CHECK-NEXT: unreachable
+; CHECK: default:
+; CHECK-NEXT: call void @foo(i32 3)
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+ switch i2 %a, label %default [i2 0, label %case0
+ i2 1, label %case1
+ i2 2, label %case2], !prof !0
+
+case0:
+ call void @foo(i32 0)
+ ret void
+case1:
+ call void @foo(i32 1)
+ ret void
+case2:
+ call void @foo(i32 2)
+ ret void
+default:
+ call void @foo(i32 3)
+ ret void
+}
+
; Negative test - check for possible overflow when computing
; number of possible cases.
define void @test4(i128 %a) {
@@ -267,3 +313,40 @@ default:
declare void @llvm.assume(i1)
+define zeroext i1 @test8(i128 %a) {
+; We should not transform conditions wider than 64 bits.
+; CHECK-LABEL: define zeroext i1 @test8(
+; CHECK-SAME: i128 [[A:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = and i128 [[A]], 3894222643901120721397872246915072
+; CHECK-NEXT: switch i128 [[TMP0]], label [[LOR_RHS:%.*]] [
+; CHECK-NEXT: i128 1298074214633706907132624082305024, label [[LOR_END:%.*]]
+; CHECK-NEXT: i128 2596148429267413814265248164610048, label [[LOR_END]]
+; CHECK-NEXT: i128 3894222643901120721397872246915072, label [[LOR_END]]
+; CHECK-NEXT: ]
+; CHECK: lor.rhs:
+; CHECK-NEXT: br label [[LOR_END]]
+; CHECK: lor.end:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[LOR_RHS]] ], [ true, [[ENTRY]] ], [ true, [[ENTRY]] ]
+; CHECK-NEXT: ret i1 [[TMP1]]
+;
+entry:
+ %0 = and i128 %a, 3894222643901120721397872246915072
+ switch i128 %0, label %lor.rhs [
+ i128 1298074214633706907132624082305024, label %lor.end
+ i128 2596148429267413814265248164610048, label %lor.end
+ i128 3894222643901120721397872246915072, label %lor.end
+ ]
+
+lor.rhs: ; preds = %entry
+ br label %lor.end
+
+lor.end: ; preds = %entry, %entry, %entry, %lor.rhs
+ %1 = phi i1 [ true, %entry ], [ false, %lor.rhs ], [ true, %entry ], [ true, %entry ]
+ ret i1 %1
+}
+
+!0 = !{!"branch_weights", i32 8, i32 4, i32 2, i32 1}
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 0, i32 4, i32 2, i32 1, i32 8}
+;.
diff --git a/llvm/test/Transforms/Util/add-TLI-mappings.ll b/llvm/test/Transforms/Util/add-TLI-mappings.ll
index 0e005ae75ef5..4e4b81e89a32 100644
--- a/llvm/test/Transforms/Util/add-TLI-mappings.ll
+++ b/llvm/test/Transforms/Util/add-TLI-mappings.ll
@@ -274,19 +274,19 @@ attributes #0 = { nounwind readnone }
; ARMPL-SAME: _ZGVsMxvl4_modff(armpl_svmodf_f32_x)" }
; ARMPL: attributes #[[SIN]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N2v_sin(armpl_vsinq_f64),
-; ARMPL-SAME _ZGVsMxv_sin(armpl_svsin_f64_x)" }
+; ARMPL-SAME: _ZGVsMxv_sin(armpl_svsin_f64_x)" }
; ARMPL: attributes #[[SINCOS]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincos(armpl_vsincosq_f64),
-; ARMPL-SAME: _ZGVsMxvl8l8_sincos(armpl_svsincos_f64_x)" }
+; ARMPL-SAME: _ZGVsMxvl8l8_sincos(armpl_svsincos_f64_x)" }
; ARMPL: attributes #[[SINCOSF]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincosf(armpl_vsincosq_f32),
; ARMPL-SAME: _ZGVsMxvl4l4_sincosf(armpl_svsincos_f32_x)" }
; ARMPL: attributes #[[SINCOSPI]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N2vl8l8_sincospi(armpl_vsincospiq_f64),
-; ARMPL-SAME: _ZGVsMxvl8l8_sincospi(armpl_svsincospi_f64_x)" }
+; ARMPL-SAME: _ZGVsMxvl8l8_sincospi(armpl_svsincospi_f64_x)" }
; ARMPL: attributes #[[SINCOSPIF]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N4vl4l4_sincospif(armpl_vsincospiq_f32),
; ARMPL-SAME: _ZGVsMxvl4l4_sincospif(armpl_svsincospi_f32_x)" }
; ARMPL: attributes #[[LOG10]] = { "vector-function-abi-variant"=
; ARMPL-SAME: "_ZGV_LLVM_N4v_llvm.log10.f32(armpl_vlog10q_f32),
-; ARMPL-SAME _ZGVsMxv_llvm.log10.f32(armpl_svlog10_f32_x)" }
+; ARMPL-SAME: _ZGVsMxv_llvm.log10.f32(armpl_svlog10_f32_x)" }
diff --git a/llvm/test/tools/llvm-driver/symlink-call.test b/llvm/test/tools/llvm-driver/symlink-call.test
index eeedf9edc73f..ca6098216b13 100644
--- a/llvm/test/tools/llvm-driver/symlink-call.test
+++ b/llvm/test/tools/llvm-driver/symlink-call.test
@@ -14,6 +14,8 @@
# RUN: %t/cxxfilt-15 --help | FileCheck %s
# RUN: ln -s %llvm %t/cxxfilt-15.exe
# RUN: %t/cxxfilt-15.exe --help | FileCheck %s
+# RUN: ln -s %llvm %t/c++filt
+# RUN: %t/c++filt --help | FileCheck %s
# RUN: ln -s %llvm %t/llvm-15
# RUN: %t/llvm-15 cxxfilt --help | FileCheck %s
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
new file mode 100644
index 000000000000..ab81f9fb04af
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -0,0 +1,791 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN GPR32-bit
+ldr w0, [sp]
+add x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add v0.8h, v0.8h, v0.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add v0.4s, v0.4s, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add v0.2d, v0.2d, v0.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ins
+ins v0.b[0], v1.b[1]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN lanewise-load
+ld1 {v0.b}[0], [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - GPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 41
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.88
+# CHECK-NEXT: IPC: 4.88
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 0.33 * ldr w0, [sp]
+# CHECK-NEXT: 1 1 0.25 add x0, x0, x0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 0.22 0.22 0.28 0.28 - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr w0, [sp]
+# CHECK-NEXT: - - - - - - - 0.22 0.22 0.28 0.28 - - - - add x0, x0, x0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeeeER . ldr w0, [sp]
+# CHECK-NEXT: [0,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [1,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [1,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [2,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [2,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [3,0] D=eeeeER. ldr w0, [sp]
+# CHECK-NEXT: [3,1] D=====eER add x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 0.5 ldr w0, [sp]
+# CHECK-NEXT: 1. 4 5.3 0.0 0.0 add x0, x0, x0
+# CHECK-NEXT: 4 3.3 0.6 0.3 <total>
+
+# CHECK: [1] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [2] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [3] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [4] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.8h, v0.8h, v0.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.4s, v0.4s, v0.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [7] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.2d, v0.2d, v0.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [8] Code Region - ins
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 403
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 0.25 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [0,1] D==eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D====eeER . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [1,1] D======eeER . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D========eeER . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [2,1] D==========eeER. . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D============eeER . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [3,1] D==============eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 7.0 0.3 0.0 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1. 4 9.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 8.0 0.1 0.0 <total>
+
+# CHECK: [9] Code Region - lanewise-load
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 1003
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.30
+# CHECK-NEXT: IPC: 0.20
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 8 0.33 * ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - 0.50 - 0.50 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [0,1] D========eeER . . . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D==========eeeeeeeeER . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [1,1] D==================eeER . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D====================eeeeeeeeER . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [2,1] D============================eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D==============================eeeeeeeeER . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [3,1] D======================================eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 16.0 0.3 0.0 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1. 4 24.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 20.0 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
new file mode 100644
index 000000000000..fd2083dc1277
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -0,0 +1,812 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR64-bit
+ldr d0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR128-bit
+ldr q0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN insr
+insr z0.s, w0
+add z0.s, z0.s, z0.s
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [1] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [2] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [3] Code Region - FPR64-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr d0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr d0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr d0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr d0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [4] Code Region - FPR128-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr q0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr q0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr q0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr q0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [7] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [8] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [9] Code Region - insr
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.37
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 6 1.00 insr z0.s, w0
+# CHECK-NEXT: 1 2 0.25 add z0.s, z0.s, z0.s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - 0.33 1.00 0.33 0.34
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - 1.00 - - insr z0.s, w0
+# CHECK-NEXT: - - - - - - - - - - - - - 0.33 - 0.33 0.34 add z0.s, z0.s, z0.s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . . . . . insr z0.s, w0
+# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
+# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/X86/call-latency.s b/llvm/test/tools/llvm-mca/X86/call-latency.s
new file mode 100644
index 000000000000..9559d11f1b0a
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/X86/call-latency.s
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 %s | FileCheck --check-prefixes=ALL,DEFAULT %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -call-latency=50 -iterations=1 %s | FileCheck --check-prefixes=ALL,CUSTOM %s
+
+callq printf
+
+# ALL: Iterations: 1
+# ALL-NEXT: Instructions: 1
+
+# CUSTOM-NEXT: Total Cycles: 53
+# DEFAULT-NEXT: Total Cycles: 103
+
+# ALL-NEXT: Total uOps: 1
+
+# ALL: Dispatch Width: 2
+
+# CUSTOM-NEXT: uOps Per Cycle: 0.02
+# CUSTOM-NEXT: IPC: 0.02
+
+# DEFAULT-NEXT: uOps Per Cycle: 0.01
+# DEFAULT-NEXT: IPC: 0.01
+
+# ALL-NEXT: Block RThroughput: 0.5
+
+# ALL: Instruction Info:
+# ALL-NEXT: [1]: #uOps
+# ALL-NEXT: [2]: Latency
+# ALL-NEXT: [3]: RThroughput
+# ALL-NEXT: [4]: MayLoad
+# ALL-NEXT: [5]: MayStore
+# ALL-NEXT: [6]: HasSideEffects (U)
+
+# ALL: [1] [2] [3] [4] [5] [6] Instructions:
+# ALL-NEXT: 1 1 0.50 callq printf
+
+# ALL: Resources:
+# ALL-NEXT: [0] - JALU0
+# ALL-NEXT: [1] - JALU1
+# ALL-NEXT: [2] - JDiv
+# ALL-NEXT: [3] - JFPA
+# ALL-NEXT: [4] - JFPM
+# ALL-NEXT: [5] - JFPU0
+# ALL-NEXT: [6] - JFPU1
+# ALL-NEXT: [7] - JLAGU
+# ALL-NEXT: [8] - JMul
+# ALL-NEXT: [9] - JSAGU
+# ALL-NEXT: [10] - JSTC
+# ALL-NEXT: [11] - JVALU0
+# ALL-NEXT: [12] - JVALU1
+# ALL-NEXT: [13] - JVIMUL
+
+# ALL: Resource pressure per iteration:
+# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# ALL-NEXT: - 1.00 - - - - - - - - - - - -
+
+# ALL: Resource pressure by instruction:
+# ALL-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# ALL-NEXT: - 1.00 - - - - - - - - - - - - callq printf
diff --git a/llvm/test/tools/llvm-objcopy/tool-options.test b/llvm/test/tools/llvm-objcopy/tool-options.test
new file mode 100644
index 000000000000..8d2bb4476009
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/tool-options.test
@@ -0,0 +1,6 @@
+## An error must be reported if a required argument value is missing.
+# RUN: not llvm-objcopy --only-section 2>&1 | FileCheck --check-prefix=CHECK-NO-VALUE-ONLY-SECTION %s
+# CHECK-NO-VALUE-ONLY-SECTION: error: argument to '--only-section' is missing (expected 1 value(s))
+
+# RUN: not llvm-objcopy -O 2>&1 | FileCheck --check-prefix=CHECK-NO-VALUE-O %s
+# CHECK-NO-VALUE-O: error: argument to '-O' is missing (expected 1 value(s))
diff --git a/llvm/test/tools/llvm-profdata/show-order-error.proftext b/llvm/test/tools/llvm-profdata/show-order-error.proftext
new file mode 100644
index 000000000000..633f1a9949b6
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order-error.proftext
@@ -0,0 +1,27 @@
+# RUN: not llvm-profdata order %s --num-test-traces=10 2>&1 | FileCheck %s
+
+# CHECK: --num-test-traces must be smaller than the total number of traces
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+1
+# Trace Stream Size:
+1
+# Weight
+1
+a, b
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+b
+0x5678
+1
+202
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
index 8ef26847ad77..28eb1b9b42af 100644
--- a/llvm/test/tools/llvm-profdata/show-order.proftext
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -1,4 +1,6 @@
-# RUN: llvm-profdata order %s | FileCheck %s
+# RUN: llvm-profdata order %s --num-test-traces=1 | FileCheck %s
+
+# CHECK: # Total area under the page fault curve: 4.000000e+00
# CHECK: a
# CHECK: b
@@ -9,9 +11,9 @@
:ir
:temporal_prof_traces
# Num Traces
-3
+4
# Trace Stream Size:
-3
+4
# Weight
1
a, main.c:b, c
@@ -21,6 +23,9 @@ a, x, main.c:b, c
# Weight
1
a, main.c:b, c
+# Weight
+1
+a, main.c:b, c, x
a
# Func Hash:
diff --git a/llvm/test/tools/llvm-profgen/profile-density.test b/llvm/test/tools/llvm-profgen/profile-density.test
index 0eb83838d16e..086697e8da0a 100644
--- a/llvm/test/tools/llvm-profgen/profile-density.test
+++ b/llvm/test/tools/llvm-profgen/profile-density.test
@@ -1,13 +1,17 @@
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -hot-function-density-threshold=10 --trim-cold-profile=0 &> %t2
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t1 --use-offset=0 --show-density -profile-density-threshold=10 --trim-cold-profile=0 &> %t2
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-DENSITY
-
-; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -hot-function-density-threshold=1 &> %t4
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t3 --show-density -profile-density-threshold=1 -profile-density-threshold=10000 &> %t4
; RUN: FileCheck %s --input-file %t4 --check-prefix=CHECK-DENSITY-CS
+; RUN: llvm-profgen --format=text --unsymbolized-profile=%S/Inputs/profile-density-cs.raw.prof --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t5 --show-density -profile-density-threshold=1 -profile-density-cutoff-hot=800000 &> %t6
+; RUN: FileCheck %s --input-file %t6 --check-prefix=CHECK-DENSITY-CS-80
+
+;CHECK-DENSITY: Sample PGO is estimated to optimize better with 2.9x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
+;CHECK-DENSITY: Functions with density >= 3.5 account for 99.00% total sample counts.
-;CHECK-DENSITY: Sample PGO is estimated to optimize better with 3.1x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
-;CHECK-DENSITY: Minimum profile density for hot functions with top 99.00% total samples: 3.2
+;CHECK-DENSITY-CS: Sample PGO is estimated to optimize better with 12.5x more samples. Please consider increasing sampling rate or profiling for longer duration to get more samples.
+;CHECK-DENSITY-CS: Functions with density >= 800.1 account for 99.00% total sample counts.
-;CHECK-DENSITY-CS: Minimum profile density for hot functions with top 99.00% total samples: 128.3
+;CHECK-DENSITY-CS-80: Functions with density >= 1886.2 account for 80.00% total sample counts.
; original code:
; clang -O3 -g -fno-optimize-sibling-calls -fdebug-info-for-profiling qsort.c -o a.out
diff --git a/llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test b/llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test
index 752cb723cd22..f4957b42a877 100644
--- a/llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test
+++ b/llvm/test/tools/llvm-readobj/ELF/note-core-ntfile.test
@@ -3,6 +3,7 @@
# RUN: yaml2obj %s -o %t.o
# RUN: llvm-readelf --notes %t.o | FileCheck %s --check-prefix=GNU
# RUN: llvm-readobj --notes %t.o | FileCheck %s --check-prefix=LLVM
+# RUN: llvm-readobj --elf-output-style=JSON --pretty-print --notes %t.o | FileCheck %s --check-prefix=JSON
## llvm-mc doesn't support generating ET_CORE files; the 'Content' field was
## generated with the following steps:
@@ -72,24 +73,62 @@ ProgramHeaders:
# LLVM-NEXT: Data size: 0x80
# LLVM-NEXT: Type: NT_FILE (mapped files)
# LLVM-NEXT: Page Size: 4096
-# LLVM-NEXT: Mapping [
+# LLVM-NEXT: Mappings [
+# LLVM-NEXT: {
# LLVM-NEXT: Start: 0x1000
# LLVM-NEXT: End: 0x2000
# LLVM-NEXT: Offset: 0x3000
# LLVM-NEXT: Filename: /path/to/a.out
-# LLVM-NEXT: ]
-# LLVM-NEXT: Mapping [
+# LLVM-NEXT: }
+# LLVM-NEXT: {
# LLVM-NEXT: Start: 0x4000
# LLVM-NEXT: End: 0x5000
# LLVM-NEXT: Offset: 0x6000
# LLVM-NEXT: Filename: /path/to/libc.so
-# LLVM-NEXT: ]
-# LLVM-NEXT: Mapping [
+# LLVM-NEXT: }
+# LLVM-NEXT: {
# LLVM-NEXT: Start: 0x7000
# LLVM-NEXT: End: 0x8000
# LLVM-NEXT: Offset: 0x9000
# LLVM-NEXT: Filename: [stack]
-# LLVM-NEXT: ]
-# LLVM-NEXT: }
+# LLVM-NEXT: }
+# LLVM-NEXT: ]
# LLVM-NEXT: }
+# LLVM-NEXT: }
# LLVM-NEXT: ]
+
+# JSON: "Notes": [
+# JSON-NEXT: {
+# JSON-NEXT: "NoteSection": {
+# JSON-NEXT: "Name": "<?>",
+# JSON-NEXT: "Offset": 120,
+# JSON-NEXT: "Size": 148,
+# JSON-NEXT: "Note": {
+# JSON-NEXT: "Owner": "CORE",
+# JSON-NEXT: "Data size": 128,
+# JSON-NEXT: "Type": "NT_FILE (mapped files)",
+# JSON-NEXT: "Page Size": 4096,
+# JSON-NEXT: "Mappings": [
+# JSON-NEXT: {
+# JSON-NEXT: "Start": 4096,
+# JSON-NEXT: "End": 8192,
+# JSON-NEXT: "Offset": 12288,
+# JSON-NEXT: "Filename": "/path/to/a.out"
+# JSON-NEXT: },
+# JSON-NEXT: {
+# JSON-NEXT: "Start": 16384,
+# JSON-NEXT: "End": 20480,
+# JSON-NEXT: "Offset": 24576,
+# JSON-NEXT: "Filename": "/path/to/libc.so"
+# JSON-NEXT: },
+# JSON-NEXT: {
+# JSON-NEXT: "Start": 28672,
+# JSON-NEXT: "End": 32768,
+# JSON-NEXT: "Offset": 36864,
+# JSON-NEXT: "Filename": "[stack]"
+# JSON-NEXT: }
+# JSON-NEXT: ]
+# JSON-NEXT: }
+# JSON-NEXT: }
+# JSON-NEXT: }
+# JSON-NEXT: ]
diff --git a/llvm/tools/llvm-cxxfilt/CMakeLists.txt b/llvm/tools/llvm-cxxfilt/CMakeLists.txt
index cbc4c2db6154..a644baffdd90 100644
--- a/llvm/tools/llvm-cxxfilt/CMakeLists.txt
+++ b/llvm/tools/llvm-cxxfilt/CMakeLists.txt
@@ -17,6 +17,10 @@ add_llvm_tool(llvm-cxxfilt
GENERATE_DRIVER
)
+if(LLVM_TOOL_LLVM_DRIVER_BUILD)
+ set_property(GLOBAL APPEND PROPERTY LLVM_DRIVER_HIDDEN_TOOL_ALIASES_llvm-cxxfilt c++filt)
+endif()
+
if(LLVM_INSTALL_BINUTILS_SYMLINKS)
add_llvm_tool_symlink(c++filt llvm-cxxfilt)
endif()
diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp
index f310097eec63..8218bd5a74ea 100644
--- a/llvm/tools/llvm-lto/llvm-lto.cpp
+++ b/llvm/tools/llvm-lto/llvm-lto.cpp
@@ -692,8 +692,9 @@ private:
// Build a map of module to the GUIDs and summary objects that should
// be written to its index.
std::map<std::string, GVSummaryMapTy> ModuleToSummariesForIndex;
+ GVSummaryPtrSet DecSummaries;
ThinGenerator.gatherImportedSummariesForModule(
- *TheModule, *Index, ModuleToSummariesForIndex, *Input);
+ *TheModule, *Index, ModuleToSummariesForIndex, DecSummaries, *Input);
std::string OutputName = OutputFilename;
if (OutputName.empty()) {
@@ -703,7 +704,7 @@ private:
std::error_code EC;
raw_fd_ostream OS(OutputName, EC, sys::fs::OpenFlags::OF_None);
error(EC, "error opening the file '" + OutputName + "'");
- writeIndexToFile(*Index, OS, &ModuleToSummariesForIndex);
+ writeIndexToFile(*Index, OS, &ModuleToSummariesForIndex, &DecSummaries);
}
}
diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp
index 03d7d7944b9c..cc5d4f5fa05d 100644
--- a/llvm/tools/llvm-mca/llvm-mca.cpp
+++ b/llvm/tools/llvm-mca/llvm-mca.cpp
@@ -135,6 +135,11 @@ static cl::opt<unsigned>
"(instructions per cycle)"),
cl::cat(ToolOptions), cl::init(0));
+static cl::opt<unsigned>
+ CallLatency("call-latency", cl::Hidden,
+ cl::desc("Number of cycles to assume for a call instruction"),
+ cl::cat(ToolOptions), cl::init(100U));
+
enum class SkipType { NONE, LACK_SCHED, PARSE_FAILURE, ANY_FAILURE };
static cl::opt<enum SkipType> SkipUnsupportedInstructions(
@@ -568,7 +573,7 @@ int main(int argc, char **argv) {
}
// Create an instruction builder.
- mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, CallLatency);
// Create a context to control ownership of the pipeline hardware.
mca::Context MCA(*MRI, *STI);
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index a1897334cff2..4ab3b7265f2f 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -571,6 +571,12 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
llvm::opt::InputArgList InputArgs =
T.ParseArgs(ArgsArr, MissingArgumentIndex, MissingArgumentCount);
+ if (MissingArgumentCount)
+ return createStringError(
+ errc::invalid_argument,
+ "argument to '%s' is missing (expected %d value(s))",
+ InputArgs.getArgString(MissingArgumentIndex), MissingArgumentCount);
+
if (InputArgs.size() == 0 && DashDash == RawArgsArr.end()) {
printHelp(T, errs(), ToolType::Objcopy);
exit(1);
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
index 693af066bc0f..28c3afa10164 100644
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -340,7 +340,7 @@ cl::opt<unsigned long long> OverlapValueCutoff(
"profile with max count value greater then the parameter value"),
cl::sub(OverlapSubcommand));
-// Options unique to show subcommand.
+// Options specific to show subcommand.
cl::opt<bool> ShowCounts("counts", cl::init(false),
cl::desc("Show counter values for shown functions"),
cl::sub(ShowSubcommand));
@@ -439,6 +439,14 @@ cl::opt<bool> ShowProfileVersion("profile-version", cl::init(false),
cl::desc("Show profile version. "),
cl::sub(ShowSubcommand));
+// Options specific to order subcommand.
+cl::opt<unsigned>
+ NumTestTraces("num-test-traces", cl::init(0),
+ cl::desc("Keep aside the last <num-test-traces> traces in "
+ "the profile when computing the function order and "
+ "instead use them to evaluate that order"),
+ cl::sub(OrderSubcommand));
+
// We use this string to indicate that there are
// multiple static functions map to the same name.
const std::string DuplicateNameStr = "----";
@@ -3277,13 +3285,42 @@ static int order_main() {
// Read all entries
(void)I;
}
- auto &Traces = Reader->getTemporalProfTraces();
- auto Nodes = TemporalProfTraceTy::createBPFunctionNodes(Traces);
+ ArrayRef Traces = Reader->getTemporalProfTraces();
+ if (NumTestTraces && NumTestTraces >= Traces.size())
+ exitWithError(
+ "--" + NumTestTraces.ArgStr +
+ " must be smaller than the total number of traces: expected: < " +
+ Twine(Traces.size()) + ", actual: " + Twine(NumTestTraces));
+ ArrayRef TestTraces = Traces.take_back(NumTestTraces);
+ Traces = Traces.drop_back(NumTestTraces);
+
+ std::vector<BPFunctionNode> Nodes;
+ TemporalProfTraceTy::createBPFunctionNodes(Traces, Nodes);
BalancedPartitioningConfig Config;
BalancedPartitioning BP(Config);
BP.run(Nodes);
OS << "# Ordered " << Nodes.size() << " functions\n";
+ if (!TestTraces.empty()) {
+ // Since we don't know the symbol sizes, we assume 32 functions per page.
+ DenseMap<BPFunctionNode::IDT, unsigned> IdToPageNumber;
+ for (auto &Node : Nodes)
+ IdToPageNumber[Node.Id] = IdToPageNumber.size() / 32;
+
+ SmallSet<unsigned, 0> TouchedPages;
+ unsigned Area = 0;
+ for (auto &Trace : TestTraces) {
+ for (auto Id : Trace.FunctionNameRefs) {
+ auto It = IdToPageNumber.find(Id);
+ if (It == IdToPageNumber.end())
+ continue;
+ TouchedPages.insert(It->getSecond());
+ Area += TouchedPages.size();
+ }
+ TouchedPages.clear();
+ }
+ OS << "# Total area under the page fault curve: " << (float)Area << "\n";
+ }
OS << "# Warning: Mach-O may prefix symbols with \"_\" depending on the "
"linkage and this output does not take that into account. Some "
"post-processing may be required before passing to the linker via "
diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp
index e9442027aed3..e63c6d61b3bf 100644
--- a/llvm/tools/llvm-profgen/PerfReader.cpp
+++ b/llvm/tools/llvm-profgen/PerfReader.cpp
@@ -552,7 +552,7 @@ bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt,
// ... 0x4005c8/0x4005dc/P/-/-/0
// It's in FIFO order and separated by whitespace.
SmallVector<StringRef, 32> Records;
- TraceIt.getCurrentLine().split(Records, " ", -1, false);
+ TraceIt.getCurrentLine().rtrim().split(Records, " ", -1, false);
auto WarnInvalidLBR = [](TraceStream &TraceIt) {
WithColor::warning() << "Invalid address in LBR record at line "
<< TraceIt.getLineNumber() << ": "
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index 5aa44108f966..2118e954fe54 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -75,14 +75,18 @@ static cl::opt<int, true> CSProfMaxContextDepth(
"depth limit."),
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
-static cl::opt<double> HotFunctionDensityThreshold(
- "hot-function-density-threshold", llvm::cl::init(1000),
- llvm::cl::desc(
- "specify density threshold for hot functions (default: 1000)"),
+static cl::opt<double> ProfileDensityThreshold(
+ "profile-density-threshold", llvm::cl::init(50),
+ llvm::cl::desc("If the profile density is below the given threshold, it "
+ "will be suggested to increase the sampling rate."),
llvm::cl::Optional);
static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
llvm::cl::desc("show profile density details"),
llvm::cl::Optional);
+static cl::opt<int> ProfileDensityCutOffHot(
+ "profile-density-cutoff-hot", llvm::cl::init(990000),
+ llvm::cl::desc("Total samples cutoff for functions used to calculate "
+ "profile density."));
static cl::opt<bool> UpdateTotalSamples(
"update-total-samples", llvm::cl::init(false),
@@ -179,21 +183,22 @@ void ProfileGeneratorBase::write() {
void ProfileGeneratorBase::showDensitySuggestion(double Density) {
if (Density == 0.0)
- WithColor::warning() << "The --profile-summary-cutoff-hot option may be "
+ WithColor::warning() << "The output profile is empty or the "
+ "--profile-density-cutoff-hot option is "
"set too low. Please check your command.\n";
- else if (Density < HotFunctionDensityThreshold)
+ else if (Density < ProfileDensityThreshold)
WithColor::warning()
<< "Sample PGO is estimated to optimize better with "
- << format("%.1f", HotFunctionDensityThreshold / Density)
+ << format("%.1f", ProfileDensityThreshold / Density)
<< "x more samples. Please consider increasing sampling rate or "
"profiling for longer duration to get more samples.\n";
if (ShowDensity)
- outs() << "Minimum profile density for hot functions with top "
+ outs() << "Functions with density >= " << format("%.1f", Density)
+ << " account for "
<< format("%.2f",
- static_cast<double>(ProfileSummaryCutoffHot.getValue()) /
- 10000)
- << "% total samples: " << format("%.1f", Density) << "\n";
+ static_cast<double>(ProfileDensityCutOffHot) / 10000)
+ << "% total sample counts.\n";
}
bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
@@ -238,32 +243,6 @@ void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
}
}
-double ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles,
- uint64_t HotCntThreshold) {
- double Density = DBL_MAX;
- std::vector<const FunctionSamples *> HotFuncs;
- for (auto &I : Profiles) {
- auto &FuncSamples = I.second;
- if (FuncSamples.getTotalSamples() < HotCntThreshold)
- continue;
- HotFuncs.emplace_back(&FuncSamples);
- }
-
- for (auto *FuncSamples : HotFuncs) {
- auto *Func = Binary->getBinaryFunction(FuncSamples->getFunction());
- if (!Func)
- continue;
- uint64_t FuncSize = Func->getFuncSize();
- if (FuncSize == 0)
- continue;
- Density =
- std::min(Density, static_cast<double>(FuncSamples->getTotalSamples()) /
- FuncSize);
- }
-
- return Density == DBL_MAX ? 0.0 : Density;
-}
-
void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
const RangeSample &Ranges) {
@@ -768,9 +747,95 @@ void ProfileGenerator::populateBoundarySamplesForAllFunctions(
}
}
+void ProfileGeneratorBase::calculateBodySamplesAndSize(
+ const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
+ uint64_t &FuncBodySize) {
+  // Note that ideally the size should be the number of function instructions.
+  // However, for a probe-based profile, we don't have an accurate instruction
+  // count for each probe; instead, the probe sample is the sample count for
+  // the block, which is equivalent to
+  // total_instruction_samples/num_of_instructions in one block. Hence, we use
+  // the number of probes as a proxy for the function's size.
+ FuncBodySize += FSamples.getBodySamples().size();
+
+  // The accumulated body samples re-calculated here could differ from the
+  // TotalSamples(getTotalSamples) field of FunctionSamples for a line-number
+  // based profile. The reason is that TotalSamples is the sum of all the
+  // samples of the machine instructions in one source-code line, whereas the
+  // entry of BodySamples is only the maximum of them, so TotalSamples is
+  // usually much bigger than the accumulated body samples, as one source-code
+  // line can emit many machine instructions. We observed a regression when we
+  // switched to using the accumulated body samples (by using
+  // -update-total-samples). Hence, it's safer to re-calculate here to avoid
+  // such a discrepancy. There is no problem for probe-based profiles, as
+  // TotalSamples is exactly the same as the accumulated body samples.
+ for (const auto &I : FSamples.getBodySamples())
+ TotalBodySamples += I.second.getSamples();
+
+ for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
+ for (const auto &Callee : CallsiteSamples.second) {
+ // For binary-level density, the inlinees' samples and size should be
+ // included in the calculation.
+ calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
+ FuncBodySize);
+ }
+}
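
For illustration, here is a minimal standalone sketch of the accumulation above, using a simplified stand-in type rather than llvm-profgen's FunctionSamples; the FuncSamples struct, the accumulate helper, and the sample values are illustrative only, not part of the patch:

```
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

// Simplified stand-in for a function's profile: per-offset body samples plus
// the profiles of inlined callees.
struct FuncSamples {
  std::map<uint64_t, uint64_t> BodySamples; // offset/probe id -> samples
  std::vector<FuncSamples> InlinedCallees;
};

// Accumulate body samples and a size proxy (the number of body-sample
// entries, i.e. probes or lines with samples), recursing into inlinees so the
// result reflects binary-level density.
static void accumulate(const FuncSamples &FS, uint64_t &TotalBodySamples,
                       uint64_t &FuncBodySize) {
  FuncBodySize += FS.BodySamples.size();
  for (const auto &I : FS.BodySamples)
    TotalBodySamples += I.second;
  for (const FuncSamples &Callee : FS.InlinedCallees)
    accumulate(Callee, TotalBodySamples, FuncBodySize);
}

int main() {
  FuncSamples Inlinee{{{0, 30}}, {}};
  FuncSamples F{{{0, 100}, {4, 20}}, {Inlinee}};
  uint64_t Samples = 0, Size = 0;
  accumulate(F, Samples, Size);
  std::printf("samples=%llu size=%llu density=%.1f\n",
              (unsigned long long)Samples, (unsigned long long)Size,
              (double)Samples / Size); // samples=150 size=3 density=50.0
  return 0;
}
```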
+
+// Calculate the profile density:
+// Calculate the density for each function, sort the functions by density in
+// descending order, and keep accumulating their total samples until the sum
+// exceeds the percentage threshold (cut-off) of the total profile samples.
+// The profile density is the last (minimum) function density of the processed
+// functions; if the profile density is good, all the functions hot to perf
+// have good density. The percentage threshold (--profile-density-cutoff-hot)
+// is configurable depending on how much regression the system wants to tolerate.
+double
+ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
+ double ProfileDensity = 0.0;
+
+ uint64_t TotalProfileSamples = 0;
+ // A list of (function density, total samples) pairs.
+ std::vector<std::pair<double, uint64_t>> FuncDensityList;
+ for (const auto &I : Profiles) {
+ uint64_t TotalBodySamples = 0;
+ uint64_t FuncBodySize = 0;
+ calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
+
+ if (FuncBodySize == 0)
+ continue;
+
+ double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
+ TotalProfileSamples += TotalBodySamples;
+ FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
+ }
+
+ // Sort by density in descending order.
+ llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
+ const std::pair<double, uint64_t> &B) {
+ if (A.first != B.first)
+ return A.first > B.first;
+ return A.second < B.second;
+ });
+
+ uint64_t AccumulatedSamples = 0;
+ uint32_t I = 0;
+ assert(ProfileDensityCutOffHot <= 1000000 &&
+ "The cutoff value is greater than 1000000(100%)");
+ while (AccumulatedSamples < TotalProfileSamples *
+ static_cast<float>(ProfileDensityCutOffHot) /
+ 1000000 &&
+ I < FuncDensityList.size()) {
+ AccumulatedSamples += FuncDensityList[I].second;
+ ProfileDensity = FuncDensityList[I].first;
+ I++;
+ }
+
+ return ProfileDensity;
+}
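
Likewise, a minimal standalone sketch of the cutoff accumulation above, assuming the per-function (density, total samples) pairs have already been computed; the cutoff is expressed in parts-per-million to mirror the --profile-density-cutoff-hot option, and the function and variable names below are illustrative:

```
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

static double calculateDensitySketch(
    std::vector<std::pair<double, uint64_t>> Funcs, uint64_t CutOffPPM) {
  uint64_t TotalSamples = 0;
  for (const auto &F : Funcs)
    TotalSamples += F.second;

  // Hottest (highest-density) functions first.
  std::stable_sort(
      Funcs.begin(), Funcs.end(),
      [](const auto &A, const auto &B) { return A.first > B.first; });

  double Density = 0.0;
  uint64_t Accumulated = 0;
  for (const auto &F : Funcs) {
    if (Accumulated >= TotalSamples * static_cast<double>(CutOffPPM) / 1000000)
      break;
    Accumulated += F.second;
    Density = F.first; // Minimum density among the functions processed so far.
  }
  return Density;
}

int main() {
  // Three functions as (samples / size, total samples) pairs.
  std::vector<std::pair<double, uint64_t>> Funcs = {
      {50.0, 5000}, {10.0, 4000}, {0.5, 1000}};
  // With a 90% cutoff the two hottest functions cover 9000 of 10000 samples,
  // so the reported density is 10.0 (the density of the last one processed).
  std::printf("%.1f\n", calculateDensitySketch(Funcs, 900000));
  return 0;
}
```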
+
void ProfileGeneratorBase::calculateAndShowDensity(
const SampleProfileMap &Profiles) {
- double Density = calculateDensity(Profiles, HotCountThreshold);
+ double Density = calculateDensity(Profiles);
showDensitySuggestion(Density);
}
@@ -1057,17 +1122,13 @@ void CSProfileGenerator::postProcessProfiles() {
CSProfMaxColdContextDepth, EnableCSPreInliner);
}
- // Merge function samples of CS profile to calculate profile density.
- sampleprof::SampleProfileMap ContextLessProfiles;
- ProfileConverter::flattenProfile(ProfileMap, ContextLessProfiles, true);
-
- calculateAndShowDensity(ContextLessProfiles);
if (GenCSNestedProfile) {
ProfileConverter CSConverter(ProfileMap);
CSConverter.convertCSProfiles();
FunctionSamples::ProfileIsCS = false;
}
filterAmbiguousProfile(ProfileMap);
+ ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
}
void ProfileGeneratorBase::computeSummaryAndThreshold(
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
index d258fb78bfb1..5e36128530cd 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -116,10 +116,13 @@ protected:
void computeSummaryAndThreshold(SampleProfileMap &ProfileMap);
- void calculateAndShowDensity(const SampleProfileMap &Profiles);
+ void calculateBodySamplesAndSize(const FunctionSamples &FSamples,
+ uint64_t &TotalBodySamples,
+ uint64_t &FuncBodySize);
+
+ double calculateDensity(const SampleProfileMap &Profiles);
- double calculateDensity(const SampleProfileMap &Profiles,
- uint64_t HotCntThreshold);
+ void calculateAndShowDensity(const SampleProfileMap &Profiles);
void showDensitySuggestion(double Density);
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp
index a752cc401529..966531ef5609 100644
--- a/llvm/tools/llvm-readobj/ELFDumper.cpp
+++ b/llvm/tools/llvm-readobj/ELFDumper.cpp
@@ -7840,8 +7840,9 @@ static bool printLLVMOMPOFFLOADNoteLLVMStyle(uint32_t NoteType,
static void printCoreNoteLLVMStyle(const CoreNote &Note, ScopedPrinter &W) {
W.printNumber("Page Size", Note.PageSize);
+ ListScope D(W, "Mappings");
for (const CoreFileMapping &Mapping : Note.Mappings) {
- ListScope D(W, "Mapping");
+ DictScope D(W);
W.printHex("Start", Mapping.Start);
W.printHex("End", Mapping.End);
W.printHex("Offset", Mapping.Offset);
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index 8ec120d70e99..ac2075cb4af4 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -209,6 +209,10 @@ static bool CheckAll(const ConstantRange &, const ConstantRange &) {
return true;
}
+static bool CheckCorrectnessOnly(const ConstantRange &, const ConstantRange &) {
+ return false;
+}
+
static bool CheckSingleElementsOnly(const ConstantRange &CR1,
const ConstantRange &CR2) {
return CR1.isSingleElement() && CR2.isSingleElement();
@@ -1019,18 +1023,102 @@ TEST_F(ConstantRangeTest, Multiply) {
});
}
+TEST_F(ConstantRangeTest, MultiplyWithNoWrap) {
+ using OBO = OverflowingBinaryOperator;
+
+ EXPECT_EQ(Empty.multiplyWithNoWrap(Some, OBO::NoUnsignedWrap), Empty);
+ EXPECT_EQ(Some.multiplyWithNoWrap(Empty, OBO::NoUnsignedWrap), Empty);
+ EXPECT_EQ(Full.multiplyWithNoWrap(Full, OBO::NoUnsignedWrap), Full);
+ EXPECT_EQ(Full.multiplyWithNoWrap(Some, OBO::NoUnsignedWrap), Full);
+ EXPECT_EQ(Some.multiplyWithNoWrap(Full, OBO::NoUnsignedWrap), Full);
+ EXPECT_EQ(ConstantRange(APInt(4, 0), APInt(4, 2))
+ .multiplyWithNoWrap(ConstantRange(APInt(4, 2), APInt(4, 0)),
+ OBO::NoUnsignedWrap),
+ ConstantRange::getFull(4));
+ EXPECT_EQ(ConstantRange(APInt(4, 1), APInt(4, 5))
+ .multiplyWithNoWrap(ConstantRange(APInt(4, 1), APInt(4, 5)),
+ OBO::NoUnsignedWrap),
+ ConstantRange(APInt(4, 1), APInt(4, 0)));
+ EXPECT_EQ(ConstantRange(APInt(8, 254), APInt(8, 0))
+ .multiplyWithNoWrap(ConstantRange(APInt(8, 252), APInt(8, 4)),
+ OBO::NoUnsignedWrap),
+ ConstantRange(APInt(8, 250), APInt(8, 9)));
+ EXPECT_EQ(ConstantRange(APInt(8, 254), APInt(8, 255))
+ .multiplyWithNoWrap(ConstantRange(APInt(8, 2), APInt(8, 4)),
+ OBO::NoUnsignedWrap),
+ ConstantRange::getEmpty(8));
+
+ EXPECT_EQ(Empty.multiplyWithNoWrap(Some, OBO::NoSignedWrap), Empty);
+ EXPECT_EQ(Some.multiplyWithNoWrap(Empty, OBO::NoSignedWrap), Empty);
+ EXPECT_EQ(Full.multiplyWithNoWrap(Full, OBO::NoSignedWrap), Full);
+ EXPECT_EQ(Full.multiplyWithNoWrap(Some, OBO::NoSignedWrap), Full);
+ EXPECT_EQ(Some.multiplyWithNoWrap(Full, OBO::NoSignedWrap), Full);
+ EXPECT_EQ(
+ ConstantRange(APInt(4, 0), APInt(4, 4))
+ .multiplyWithNoWrap(ConstantRange(APInt(4, -5, true), APInt(4, 4)),
+ OBO::NoSignedWrap),
+ ConstantRange::getFull(4));
+ EXPECT_EQ(ConstantRange(APInt(4, 0), APInt(4, 3))
+ .multiplyWithNoWrap(ConstantRange(APInt(4, 0), APInt(4, 5)),
+ OBO::NoSignedWrap),
+ ConstantRange(APInt(4, 0), APInt(4, -8, true)));
+ EXPECT_EQ(ConstantRange(APInt(8, 3), APInt(8, -11, true))
+ .multiplyWithNoWrap(ConstantRange(APInt(8, -1, true)),
+ OBO::NoSignedWrap),
+ ConstantRange(APInt(8, 12), APInt(8, -2, true)));
+ EXPECT_EQ(ConstantRange(APInt(8, 254), APInt(8, 255))
+ .multiplyWithNoWrap(ConstantRange(APInt(8, 100), APInt(8, 121)),
+ OBO::NoSignedWrap),
+ ConstantRange::getEmpty(8));
+
+ TestBinaryOpExhaustive(
+ [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ return CR1.multiplyWithNoWrap(CR2, OBO::NoUnsignedWrap);
+ },
+ [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+ bool IsOverflow;
+ APInt Res = N1.umul_ov(N2, IsOverflow);
+ if (IsOverflow)
+ return std::nullopt;
+ return Res;
+ },
+ PreferSmallest, CheckCorrectnessOnly);
+ TestBinaryOpExhaustive(
+ [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ return CR1.multiplyWithNoWrap(CR2, OBO::NoSignedWrap);
+ },
+ [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+ bool IsOverflow;
+ APInt Res = N1.smul_ov(N2, IsOverflow);
+ if (IsOverflow)
+ return std::nullopt;
+ return Res;
+ },
+ PreferSmallest, CheckCorrectnessOnly);
+ TestBinaryOpExhaustive(
+ [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ return CR1.multiplyWithNoWrap(CR2,
+ OBO::NoUnsignedWrap | OBO::NoSignedWrap);
+ },
+ [](const APInt &N1, const APInt &N2) -> std::optional<APInt> {
+ bool IsOverflow1, IsOverflow2;
+ APInt Res1 = N1.umul_ov(N2, IsOverflow1);
+ APInt Res2 = N1.smul_ov(N2, IsOverflow2);
+ if (IsOverflow1 || IsOverflow2)
+ return std::nullopt;
+ assert(Res1 == Res2 && "Multiplication results differ?");
+ return Res1;
+ },
+ PreferSmallest, CheckCorrectnessOnly);
+}
+
TEST_F(ConstantRangeTest, smul_fast) {
TestBinaryOpExhaustive(
[](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.smul_fast(CR2);
},
- [](const APInt &N1, const APInt &N2) {
- return N1 * N2;
- },
- PreferSmallest,
- [](const ConstantRange &, const ConstantRange &) {
- return false; // Check correctness only.
- });
+ [](const APInt &N1, const APInt &N2) { return N1 * N2; }, PreferSmallest,
+ CheckCorrectnessOnly);
}
TEST_F(ConstantRangeTest, UMax) {
diff --git a/llvm/unittests/IR/MDBuilderTest.cpp b/llvm/unittests/IR/MDBuilderTest.cpp
index 2b5ab81b6066..4656c70ce9ca 100644
--- a/llvm/unittests/IR/MDBuilderTest.cpp
+++ b/llvm/unittests/IR/MDBuilderTest.cpp
@@ -127,4 +127,43 @@ TEST_F(MDBuilderTest, createPCSections) {
EXPECT_EQ(mdconst::extract<ConstantInt>(Aux->getOperand(1))->getValue(),
C2->getValue());
}
+TEST_F(MDBuilderTest, createCallbackAndMerge) {
+ MDBuilder MDHelper(Context);
+ auto *CB1 = MDHelper.createCallbackEncoding(0, {1, -1}, false);
+ auto *CB2 = MDHelper.createCallbackEncoding(2, {-1}, false);
+ ASSERT_EQ(CB1->getNumOperands(), 4U);
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB1->getOperand(0)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB1->getOperand(1)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB1->getOperand(2)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB1->getOperand(3)));
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB1->getOperand(0))->getValue(), 0);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB1->getOperand(1))->getValue(), 1);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB1->getOperand(2))->getValue(), -1);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB1->getOperand(3))->getValue(),
+ false);
+ ASSERT_EQ(CB2->getNumOperands(), 3U);
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB2->getOperand(0)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB2->getOperand(1)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB2->getOperand(2)));
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB2->getOperand(0))->getValue(), 2);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB2->getOperand(1))->getValue(), -1);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB2->getOperand(2))->getValue(),
+ false);
+ auto *CBList = MDNode::get(Context, {CB1, CB2});
+ auto *CB3 = MDHelper.createCallbackEncoding(4, {5}, false);
+ auto *NewCBList = MDHelper.mergeCallbackEncodings(CBList, CB3);
+ ASSERT_EQ(NewCBList->getNumOperands(), 3U);
+ EXPECT_TRUE(NewCBList->getOperand(0) == CB1);
+ EXPECT_TRUE(NewCBList->getOperand(1) == CB2);
+ EXPECT_TRUE(NewCBList->getOperand(2) == CB3);
+
+ ASSERT_EQ(CB3->getNumOperands(), 3U);
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB3->getOperand(0)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB3->getOperand(1)));
+ ASSERT_TRUE(isa<ConstantAsMetadata>(CB3->getOperand(2)));
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB3->getOperand(0))->getValue(), 4);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB3->getOperand(1))->getValue(), 5);
+ EXPECT_EQ(mdconst::extract<ConstantInt>(CB3->getOperand(2))->getValue(),
+ false);
+}
} // namespace
diff --git a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
index 6af6f1bcdc40..24586b5aa31a 100644
--- a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
+++ b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp
@@ -8,7 +8,6 @@
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/BalancedPartitioning.h"
-#include "llvm/Testing/Support/SupportHelpers.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
@@ -31,22 +30,32 @@ TEST(BPFunctionNodeTest, Basic) {
UnorderedElementsAreArray(UNs)));
};
- auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({
- TemporalProfTraceTy({0, 1, 2, 3}),
- });
+ std::vector<BPFunctionNode> Nodes;
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3})}, Nodes, /*RemoveOutlierUNs=*/false);
+ // Utility nodes that are too infrequent or too prevalent are filtered out.
EXPECT_THAT(Nodes,
UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
- NodeIs(2, {1, 2}), NodeIs(3, {2})));
+ NodeIs(2, {2}), NodeIs(3, {2})));
- Nodes = TemporalProfTraceTy::createBPFunctionNodes({
- TemporalProfTraceTy({0, 1, 2, 3, 4}),
- TemporalProfTraceTy({4, 2}),
- });
+ Nodes.clear();
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
+ Nodes, /*RemoveOutlierUNs=*/false);
EXPECT_THAT(Nodes,
- UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
- NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
- NodeIs(4, {2, 3, 4, 5})));
+ UnorderedElementsAre(NodeIs(0, {0, 1, 2, 3}),
+ NodeIs(1, {1, 2, 3}), NodeIs(2, {2, 3, 5}),
+ NodeIs(3, {2, 3}), NodeIs(4, {3, 4, 5})));
+
+ Nodes.clear();
+ TemporalProfTraceTy::createBPFunctionNodes(
+ {TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2})},
+ Nodes, /*RemoveOutlierUNs=*/true);
+
+ EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, {1}), NodeIs(1, {1}),
+ NodeIs(2, {5}), NodeIs(3, {}),
+ NodeIs(4, {5})));
}
} // end namespace llvm
diff --git a/llvm/unittests/Support/LEB128Test.cpp b/llvm/unittests/Support/LEB128Test.cpp
index 60f5ddd568ca..5aa7139c45a7 100644
--- a/llvm/unittests/Support/LEB128Test.cpp
+++ b/llvm/unittests/Support/LEB128Test.cpp
@@ -147,7 +147,8 @@ TEST(LEB128Test, DecodeULEB128) {
TEST(LEB128Test, DecodeInvalidULEB128) {
#define EXPECT_INVALID_ULEB128(VALUE, ERROR_OFFSET) \
do { \
- const uint8_t *Value = reinterpret_cast<const uint8_t *>(VALUE); \
+ const char *DefaultValue = VALUE; \
+ const uint8_t *Value = reinterpret_cast<const uint8_t *>(DefaultValue); \
const char *Error = nullptr; \
unsigned ErrorOffset = 0; \
uint64_t Actual = \
@@ -155,12 +156,13 @@ TEST(LEB128Test, DecodeInvalidULEB128) {
EXPECT_NE(Error, nullptr); \
EXPECT_EQ(0ul, Actual); \
EXPECT_EQ(ERROR_OFFSET, ErrorOffset); \
- Value = reinterpret_cast<const uint8_t *>(VALUE); \
+ Value = reinterpret_cast<const uint8_t *>(DefaultValue); \
Error = nullptr; \
Actual = decodeULEB128AndInc(Value, Value + strlen(VALUE), &Error); \
EXPECT_NE(Error, nullptr); \
EXPECT_EQ(0ul, Actual); \
- EXPECT_EQ(ERROR_OFFSET, Value - reinterpret_cast<const uint8_t *>(VALUE)); \
+ EXPECT_EQ(ERROR_OFFSET, \
+ Value - reinterpret_cast<const uint8_t *>(DefaultValue)); \
} while (0)
// Buffer overflow.
@@ -222,7 +224,8 @@ TEST(LEB128Test, DecodeSLEB128) {
TEST(LEB128Test, DecodeInvalidSLEB128) {
#define EXPECT_INVALID_SLEB128(VALUE, ERROR_OFFSET) \
do { \
- const uint8_t *Value = reinterpret_cast<const uint8_t *>(VALUE); \
+ const char *DefaultValue = VALUE; \
+ const uint8_t *Value = reinterpret_cast<const uint8_t *>(DefaultValue); \
const char *Error = nullptr; \
unsigned ErrorOffset = 0; \
uint64_t Actual = \
@@ -230,12 +233,13 @@ TEST(LEB128Test, DecodeInvalidSLEB128) {
EXPECT_NE(Error, nullptr); \
EXPECT_EQ(0ul, Actual); \
EXPECT_EQ(ERROR_OFFSET, ErrorOffset); \
- Value = reinterpret_cast<const uint8_t *>(VALUE); \
+ Value = reinterpret_cast<const uint8_t *>(DefaultValue); \
Error = nullptr; \
Actual = decodeSLEB128AndInc(Value, Value + strlen(VALUE), &Error); \
EXPECT_NE(Error, nullptr); \
EXPECT_EQ(0ul, Actual); \
- EXPECT_EQ(ERROR_OFFSET, Value - reinterpret_cast<const uint8_t *>(VALUE)); \
+ EXPECT_EQ(ERROR_OFFSET, \
+ Value - reinterpret_cast<const uint8_t *>(DefaultValue)); \
} while (0)
// Buffer overflow.
@@ -257,7 +261,9 @@ TEST(LEB128Test, DecodeInvalidSLEB128) {
TEST(LEB128Test, DecodeAndInc) {
#define EXPECT_LEB128(FUN, VALUE, SIZE) \
do { \
- const uint8_t *V = reinterpret_cast<const uint8_t *>(VALUE), *P = V; \
+ const char *DefaultValue = VALUE; \
+ const uint8_t *V = reinterpret_cast<const uint8_t *>(DefaultValue), \
+ *P = V; \
auto Expected = FUN(P), Actual = FUN##AndInc(P, P + strlen(VALUE)); \
EXPECT_EQ(Actual, Expected); \
EXPECT_EQ(P - V, SIZE); \
diff --git a/llvm/unittests/Support/raw_socket_stream_test.cpp b/llvm/unittests/Support/raw_socket_stream_test.cpp
index a8536228666d..c4e8cfbbe7e6 100644
--- a/llvm/unittests/Support/raw_socket_stream_test.cpp
+++ b/llvm/unittests/Support/raw_socket_stream_test.cpp
@@ -7,7 +7,6 @@
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
#include <future>
-#include <iostream>
#include <stdlib.h>
#include <thread>
@@ -86,13 +85,8 @@ TEST(raw_socket_streamTest, TIMEOUT_PROVIDED) {
std::chrono::milliseconds Timeout = std::chrono::milliseconds(100);
Expected<std::unique_ptr<raw_socket_stream>> MaybeServer =
ServerListener.accept(Timeout);
-
- ASSERT_THAT_EXPECTED(MaybeServer, Failed());
- llvm::Error Err = MaybeServer.takeError();
- llvm::handleAllErrors(std::move(Err), [&](const llvm::StringError &SE) {
- std::error_code EC = SE.convertToErrorCode();
- ASSERT_EQ(EC, std::errc::timed_out);
- });
+ ASSERT_EQ(llvm::errorToErrorCode(MaybeServer.takeError()),
+ std::errc::timed_out);
}
TEST(raw_socket_streamTest, FILE_DESCRIPTOR_CLOSED) {
@@ -122,12 +116,7 @@ TEST(raw_socket_streamTest, FILE_DESCRIPTOR_CLOSED) {
// Wait for the CloseThread to finish
CloseThread.join();
-
- ASSERT_THAT_EXPECTED(MaybeServer, Failed());
- llvm::Error Err = MaybeServer.takeError();
- llvm::handleAllErrors(std::move(Err), [&](const llvm::StringError &SE) {
- std::error_code EC = SE.convertToErrorCode();
- ASSERT_EQ(EC, std::errc::operation_canceled);
- });
+ ASSERT_EQ(llvm::errorToErrorCode(MaybeServer.takeError()),
+ std::errc::operation_canceled);
}
} // namespace
diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index 0455e061f0bf..797d7dfbca20 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1996,7 +1996,6 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
AArch64::AEK_D128, AArch64::AEK_LSE128,
AArch64::AEK_SPECRES2, AArch64::AEK_RASV2,
AArch64::AEK_ITE, AArch64::AEK_GCS,
- AArch64::AEK_FPMR, AArch64::AEK_FP8,
AArch64::AEK_FAMINMAX, AArch64::AEK_FP8FMA,
AArch64::AEK_SSVE_FP8FMA, AArch64::AEK_FP8DOT2,
AArch64::AEK_SSVE_FP8DOT2, AArch64::AEK_FP8DOT4,
@@ -2005,7 +2004,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
AArch64::AEK_SMEF8F32, AArch64::AEK_SMEFA64,
AArch64::AEK_CPA, AArch64::AEK_PAUTHLR,
AArch64::AEK_TLBIW, AArch64::AEK_JSCVT,
- AArch64::AEK_FCMA,
+ AArch64::AEK_FCMA, AArch64::AEK_FP8,
+
};
std::vector<StringRef> Features;
@@ -2078,7 +2078,6 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
EXPECT_TRUE(llvm::is_contained(Features, "+specres2"));
EXPECT_TRUE(llvm::is_contained(Features, "+ite"));
EXPECT_TRUE(llvm::is_contained(Features, "+gcs"));
- EXPECT_TRUE(llvm::is_contained(Features, "+fpmr"));
EXPECT_TRUE(llvm::is_contained(Features, "+fp8"));
EXPECT_TRUE(llvm::is_contained(Features, "+faminmax"));
EXPECT_TRUE(llvm::is_contained(Features, "+fp8fma"));
@@ -2224,7 +2223,6 @@ TEST(TargetParserTest, AArch64ArchExtFeature) {
{"predres2", "nopredres2", "+specres2", "-specres2"},
{"rasv2", "norasv2", "+rasv2", "-rasv2"},
{"gcs", "nogcs", "+gcs", "-gcs"},
- {"fpmr", "nofpmr", "+fpmr", "-fpmr"},
{"fp8", "nofp8", "+fp8", "-fp8"},
{"faminmax", "nofaminmax", "+faminmax", "-faminmax"},
{"fp8fma", "nofp8fma", "+fp8fma", "-fp8fma"},
diff --git a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
index 4f444fae3d4c..4a39f5e663f2 100644
--- a/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
+++ b/llvm/unittests/tools/llvm-mca/MCATestBase.cpp
@@ -66,7 +66,7 @@ Error MCATestBase::runBaselineMCA(json::Object &Result, ArrayRef<MCInst> Insts,
// Default InstrumentManager
auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
- mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
const SmallVector<mca::Instrument *> Instruments;
SmallVector<std::unique_ptr<mca::Instruction>> LoweredInsts;
diff --git a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
index 00a44dc1bab1..ac35dce522ae 100644
--- a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
+++ b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp
@@ -33,7 +33,7 @@ TEST_F(X86TestBase, TestResumablePipeline) {
P->addEventListener(SV.get());
auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
- mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
const SmallVector<mca::Instrument *> Instruments;
// Tile size = 7
@@ -124,7 +124,7 @@ TEST_F(X86TestBase, TestInstructionRecycling) {
// Default InstrumentManager
auto IM = std::make_unique<mca::InstrumentManager>(*STI, *MCII);
- mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM);
+ mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100);
IB.setInstRecycleCallback(GetRecycledInst);
const SmallVector<mca::Instrument *> Instruments;
diff --git a/llvm/utils/TableGen/Common/CMakeLists.txt b/llvm/utils/TableGen/Common/CMakeLists.txt
index 699583f70323..13883aa8fa39 100644
--- a/llvm/utils/TableGen/Common/CMakeLists.txt
+++ b/llvm/utils/TableGen/Common/CMakeLists.txt
@@ -40,6 +40,7 @@ add_llvm_library(LLVMTableGenCommon STATIC OBJECT EXCLUDE_FROM_ALL
DEPENDS
vt_gen
+ intrinsics_gen
)
# Users may include its headers as "Common/*.h"
diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
index e1cf33e7f62f..bc3ccd888cb4 100644
--- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp
@@ -63,212 +63,9 @@ StringRef llvm::getName(MVT::SimpleValueType T) {
StringRef llvm::getEnumName(MVT::SimpleValueType T) {
// clang-format off
switch (T) {
- case MVT::Other: return "MVT::Other";
- case MVT::i1: return "MVT::i1";
- case MVT::i2: return "MVT::i2";
- case MVT::i4: return "MVT::i4";
- case MVT::i8: return "MVT::i8";
- case MVT::i16: return "MVT::i16";
- case MVT::i32: return "MVT::i32";
- case MVT::i64: return "MVT::i64";
- case MVT::i128: return "MVT::i128";
- case MVT::Any: return "MVT::Any";
- case MVT::iAny: return "MVT::iAny";
- case MVT::fAny: return "MVT::fAny";
- case MVT::vAny: return "MVT::vAny";
- case MVT::f16: return "MVT::f16";
- case MVT::bf16: return "MVT::bf16";
- case MVT::f32: return "MVT::f32";
- case MVT::f64: return "MVT::f64";
- case MVT::f80: return "MVT::f80";
- case MVT::f128: return "MVT::f128";
- case MVT::ppcf128: return "MVT::ppcf128";
- case MVT::x86mmx: return "MVT::x86mmx";
- case MVT::x86amx: return "MVT::x86amx";
- case MVT::aarch64svcount: return "MVT::aarch64svcount";
- case MVT::i64x8: return "MVT::i64x8";
- case MVT::Glue: return "MVT::Glue";
- case MVT::isVoid: return "MVT::isVoid";
- case MVT::v1i1: return "MVT::v1i1";
- case MVT::v2i1: return "MVT::v2i1";
- case MVT::v3i1: return "MVT::v3i1";
- case MVT::v4i1: return "MVT::v4i1";
- case MVT::v8i1: return "MVT::v8i1";
- case MVT::v16i1: return "MVT::v16i1";
- case MVT::v32i1: return "MVT::v32i1";
- case MVT::v64i1: return "MVT::v64i1";
- case MVT::v128i1: return "MVT::v128i1";
- case MVT::v256i1: return "MVT::v256i1";
- case MVT::v512i1: return "MVT::v512i1";
- case MVT::v1024i1: return "MVT::v1024i1";
- case MVT::v2048i1: return "MVT::v2048i1";
- case MVT::v128i2: return "MVT::v128i2";
- case MVT::v256i2: return "MVT::v256i2";
- case MVT::v64i4: return "MVT::v64i4";
- case MVT::v128i4: return "MVT::v128i4";
- case MVT::v1i8: return "MVT::v1i8";
- case MVT::v2i8: return "MVT::v2i8";
- case MVT::v3i8: return "MVT::v3i8";
- case MVT::v4i8: return "MVT::v4i8";
- case MVT::v8i8: return "MVT::v8i8";
- case MVT::v16i8: return "MVT::v16i8";
- case MVT::v32i8: return "MVT::v32i8";
- case MVT::v64i8: return "MVT::v64i8";
- case MVT::v128i8: return "MVT::v128i8";
- case MVT::v256i8: return "MVT::v256i8";
- case MVT::v512i8: return "MVT::v512i8";
- case MVT::v1024i8: return "MVT::v1024i8";
- case MVT::v1i16: return "MVT::v1i16";
- case MVT::v2i16: return "MVT::v2i16";
- case MVT::v3i16: return "MVT::v3i16";
- case MVT::v4i16: return "MVT::v4i16";
- case MVT::v8i16: return "MVT::v8i16";
- case MVT::v16i16: return "MVT::v16i16";
- case MVT::v32i16: return "MVT::v32i16";
- case MVT::v64i16: return "MVT::v64i16";
- case MVT::v128i16: return "MVT::v128i16";
- case MVT::v256i16: return "MVT::v256i16";
- case MVT::v512i16: return "MVT::v512i16";
- case MVT::v1i32: return "MVT::v1i32";
- case MVT::v2i32: return "MVT::v2i32";
- case MVT::v3i32: return "MVT::v3i32";
- case MVT::v4i32: return "MVT::v4i32";
- case MVT::v5i32: return "MVT::v5i32";
- case MVT::v6i32: return "MVT::v6i32";
- case MVT::v7i32: return "MVT::v7i32";
- case MVT::v8i32: return "MVT::v8i32";
- case MVT::v9i32: return "MVT::v9i32";
- case MVT::v10i32: return "MVT::v10i32";
- case MVT::v11i32: return "MVT::v11i32";
- case MVT::v12i32: return "MVT::v12i32";
- case MVT::v16i32: return "MVT::v16i32";
- case MVT::v32i32: return "MVT::v32i32";
- case MVT::v64i32: return "MVT::v64i32";
- case MVT::v128i32: return "MVT::v128i32";
- case MVT::v256i32: return "MVT::v256i32";
- case MVT::v512i32: return "MVT::v512i32";
- case MVT::v1024i32: return "MVT::v1024i32";
- case MVT::v2048i32: return "MVT::v2048i32";
- case MVT::v1i64: return "MVT::v1i64";
- case MVT::v2i64: return "MVT::v2i64";
- case MVT::v3i64: return "MVT::v3i64";
- case MVT::v4i64: return "MVT::v4i64";
- case MVT::v8i64: return "MVT::v8i64";
- case MVT::v16i64: return "MVT::v16i64";
- case MVT::v32i64: return "MVT::v32i64";
- case MVT::v64i64: return "MVT::v64i64";
- case MVT::v128i64: return "MVT::v128i64";
- case MVT::v256i64: return "MVT::v256i64";
- case MVT::v1i128: return "MVT::v1i128";
- case MVT::v1f16: return "MVT::v1f16";
- case MVT::v2f16: return "MVT::v2f16";
- case MVT::v3f16: return "MVT::v3f16";
- case MVT::v4f16: return "MVT::v4f16";
- case MVT::v8f16: return "MVT::v8f16";
- case MVT::v16f16: return "MVT::v16f16";
- case MVT::v32f16: return "MVT::v32f16";
- case MVT::v64f16: return "MVT::v64f16";
- case MVT::v128f16: return "MVT::v128f16";
- case MVT::v256f16: return "MVT::v256f16";
- case MVT::v512f16: return "MVT::v512f16";
- case MVT::v2bf16: return "MVT::v2bf16";
- case MVT::v3bf16: return "MVT::v3bf16";
- case MVT::v4bf16: return "MVT::v4bf16";
- case MVT::v8bf16: return "MVT::v8bf16";
- case MVT::v16bf16: return "MVT::v16bf16";
- case MVT::v32bf16: return "MVT::v32bf16";
- case MVT::v64bf16: return "MVT::v64bf16";
- case MVT::v128bf16: return "MVT::v128bf16";
- case MVT::v1f32: return "MVT::v1f32";
- case MVT::v2f32: return "MVT::v2f32";
- case MVT::v3f32: return "MVT::v3f32";
- case MVT::v4f32: return "MVT::v4f32";
- case MVT::v5f32: return "MVT::v5f32";
- case MVT::v6f32: return "MVT::v6f32";
- case MVT::v7f32: return "MVT::v7f32";
- case MVT::v8f32: return "MVT::v8f32";
- case MVT::v9f32: return "MVT::v9f32";
- case MVT::v10f32: return "MVT::v10f32";
- case MVT::v11f32: return "MVT::v11f32";
- case MVT::v12f32: return "MVT::v12f32";
- case MVT::v16f32: return "MVT::v16f32";
- case MVT::v32f32: return "MVT::v32f32";
- case MVT::v64f32: return "MVT::v64f32";
- case MVT::v128f32: return "MVT::v128f32";
- case MVT::v256f32: return "MVT::v256f32";
- case MVT::v512f32: return "MVT::v512f32";
- case MVT::v1024f32: return "MVT::v1024f32";
- case MVT::v2048f32: return "MVT::v2048f32";
- case MVT::v1f64: return "MVT::v1f64";
- case MVT::v2f64: return "MVT::v2f64";
- case MVT::v3f64: return "MVT::v3f64";
- case MVT::v4f64: return "MVT::v4f64";
- case MVT::v8f64: return "MVT::v8f64";
- case MVT::v16f64: return "MVT::v16f64";
- case MVT::v32f64: return "MVT::v32f64";
- case MVT::v64f64: return "MVT::v64f64";
- case MVT::v128f64: return "MVT::v128f64";
- case MVT::v256f64: return "MVT::v256f64";
- case MVT::nxv1i1: return "MVT::nxv1i1";
- case MVT::nxv2i1: return "MVT::nxv2i1";
- case MVT::nxv4i1: return "MVT::nxv4i1";
- case MVT::nxv8i1: return "MVT::nxv8i1";
- case MVT::nxv16i1: return "MVT::nxv16i1";
- case MVT::nxv32i1: return "MVT::nxv32i1";
- case MVT::nxv64i1: return "MVT::nxv64i1";
- case MVT::nxv1i8: return "MVT::nxv1i8";
- case MVT::nxv2i8: return "MVT::nxv2i8";
- case MVT::nxv4i8: return "MVT::nxv4i8";
- case MVT::nxv8i8: return "MVT::nxv8i8";
- case MVT::nxv16i8: return "MVT::nxv16i8";
- case MVT::nxv32i8: return "MVT::nxv32i8";
- case MVT::nxv64i8: return "MVT::nxv64i8";
- case MVT::nxv1i16: return "MVT::nxv1i16";
- case MVT::nxv2i16: return "MVT::nxv2i16";
- case MVT::nxv4i16: return "MVT::nxv4i16";
- case MVT::nxv8i16: return "MVT::nxv8i16";
- case MVT::nxv16i16: return "MVT::nxv16i16";
- case MVT::nxv32i16: return "MVT::nxv32i16";
- case MVT::nxv1i32: return "MVT::nxv1i32";
- case MVT::nxv2i32: return "MVT::nxv2i32";
- case MVT::nxv4i32: return "MVT::nxv4i32";
- case MVT::nxv8i32: return "MVT::nxv8i32";
- case MVT::nxv16i32: return "MVT::nxv16i32";
- case MVT::nxv32i32: return "MVT::nxv32i32";
- case MVT::nxv1i64: return "MVT::nxv1i64";
- case MVT::nxv2i64: return "MVT::nxv2i64";
- case MVT::nxv4i64: return "MVT::nxv4i64";
- case MVT::nxv8i64: return "MVT::nxv8i64";
- case MVT::nxv16i64: return "MVT::nxv16i64";
- case MVT::nxv32i64: return "MVT::nxv32i64";
- case MVT::nxv1f16: return "MVT::nxv1f16";
- case MVT::nxv2f16: return "MVT::nxv2f16";
- case MVT::nxv4f16: return "MVT::nxv4f16";
- case MVT::nxv8f16: return "MVT::nxv8f16";
- case MVT::nxv16f16: return "MVT::nxv16f16";
- case MVT::nxv32f16: return "MVT::nxv32f16";
- case MVT::nxv1bf16: return "MVT::nxv1bf16";
- case MVT::nxv2bf16: return "MVT::nxv2bf16";
- case MVT::nxv4bf16: return "MVT::nxv4bf16";
- case MVT::nxv8bf16: return "MVT::nxv8bf16";
- case MVT::nxv16bf16: return "MVT::nxv16bf16";
- case MVT::nxv32bf16: return "MVT::nxv32bf16";
- case MVT::nxv1f32: return "MVT::nxv1f32";
- case MVT::nxv2f32: return "MVT::nxv2f32";
- case MVT::nxv4f32: return "MVT::nxv4f32";
- case MVT::nxv8f32: return "MVT::nxv8f32";
- case MVT::nxv16f32: return "MVT::nxv16f32";
- case MVT::nxv1f64: return "MVT::nxv1f64";
- case MVT::nxv2f64: return "MVT::nxv2f64";
- case MVT::nxv4f64: return "MVT::nxv4f64";
- case MVT::nxv8f64: return "MVT::nxv8f64";
- case MVT::token: return "MVT::token";
- case MVT::Metadata: return "MVT::Metadata";
- case MVT::iPTR: return "MVT::iPTR";
- case MVT::iPTRAny: return "MVT::iPTRAny";
- case MVT::Untyped: return "MVT::Untyped";
- case MVT::funcref: return "MVT::funcref";
- case MVT::externref: return "MVT::externref";
+#define GET_VT_ATTR(Ty, N, Sz, Any, Int, FP, Vec, Sc) \
+ case MVT::Ty: return "MVT::" # Ty;
+#include "llvm/CodeGen/GenVT.inc"
default: llvm_unreachable("ILLEGAL VALUE TYPE!");
}
// clang-format on
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index 7da16e0f0cb2..85c129488d95 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -1041,7 +1041,7 @@ class NamelessValue:
var = var.replace("-", "_")
return var.upper()
- def get_affixes_from_match(self, match: re.Match):
+ def get_affixes_from_match(self, match):
prefix = re.match(self.ir_prefix, match.group(2)).group(0)
suffix = re.search(self.ir_suffix + "$", match.group(2)).group(0)
return prefix, suffix
diff --git a/llvm/utils/gn/README.rst b/llvm/utils/gn/README.rst
index 9ca545061099..52d03be533e5 100644
--- a/llvm/utils/gn/README.rst
+++ b/llvm/utils/gn/README.rst
@@ -131,7 +131,7 @@ configure is used for three classes of feature checks:
For the last two points, it would be nice if LLVM didn't have a single
``config.h`` header, but one header per toggle. That way, when e.g.
-``llvm_enable_terminfo`` is toggled, only the 3 files caring about that setting
+``llvm_enable_zlib`` is toggled, only the 3 files caring about that setting
would need to be rebuilt, instead of everything including ``config.h``.
GN doesn't believe in users setting arbitrary cflags from an environment
diff --git a/llvm/utils/gn/build/libs/terminfo/BUILD.gn b/llvm/utils/gn/build/libs/terminfo/BUILD.gn
deleted file mode 100644
index 10003d61c4df..000000000000
--- a/llvm/utils/gn/build/libs/terminfo/BUILD.gn
+++ /dev/null
@@ -1,12 +0,0 @@
-import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
-
-config("terminfo_config") {
- visibility = [ ":terminfo" ]
- libs = [ "ncurses" ]
-}
-
-group("terminfo") {
- if (llvm_enable_terminfo) {
- public_configs = [ ":terminfo_config" ]
- }
-}
diff --git a/llvm/utils/gn/build/libs/terminfo/enable.gni b/llvm/utils/gn/build/libs/terminfo/enable.gni
deleted file mode 100644
index 79ea2b601857..000000000000
--- a/llvm/utils/gn/build/libs/terminfo/enable.gni
+++ /dev/null
@@ -1,4 +0,0 @@
-declare_args() {
- # Whether to link against terminfo.
- llvm_enable_terminfo = false
-}
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 0a7cc3854056..c312c86fa164 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -140,12 +140,10 @@ copy("Headers") {
"avx512bwintrin.h",
"avx512cdintrin.h",
"avx512dqintrin.h",
- "avx512erintrin.h",
"avx512fintrin.h",
"avx512fp16intrin.h",
"avx512ifmaintrin.h",
"avx512ifmavlintrin.h",
- "avx512pfintrin.h",
"avx512vbmi2intrin.h",
"avx512vbmiintrin.h",
"avx512vbmivlintrin.h",
diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
index 188c71805f27..9075ada55c0f 100644
--- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn
@@ -95,6 +95,7 @@ static_library("Sema") {
"SemaTemplateInstantiateDecl.cpp",
"SemaTemplateVariadic.cpp",
"SemaType.cpp",
+ "SemaX86.cpp",
"TypeLocBuilder.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
index da48149c4d90..3ae50b214eb1 100644
--- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Checkers/BUILD.gn
@@ -104,6 +104,7 @@ static_library("Checkers") {
"PointerSortingChecker.cpp",
"PointerSubChecker.cpp",
"PthreadLockChecker.cpp",
+ "PutenvStackArrayChecker.cpp",
"RetainCountChecker/RetainCountChecker.cpp",
"RetainCountChecker/RetainCountDiagnostics.cpp",
"ReturnPointerRangeChecker.cpp",
@@ -111,6 +112,7 @@ static_library("Checkers") {
"ReturnValueChecker.cpp",
"RunLoopAutoreleaseLeakChecker.cpp",
"STLAlgorithmModeling.cpp",
+ "SetgidSetuidOrderChecker.cpp",
"SimpleStreamChecker.cpp",
"SmartPtrChecker.cpp",
"SmartPtrModeling.cpp",
@@ -147,6 +149,5 @@ static_library("Checkers") {
"WebKit/UncountedLambdaCapturesChecker.cpp",
"WebKit/UncountedLocalVarsChecker.cpp",
"cert/InvalidPtrChecker.cpp",
- "cert/PutenvWithAutoChecker.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 210b26e8f166..c51e4bf037db 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -294,6 +294,7 @@ if (current_toolchain == default_toolchain) {
"__atomic/atomic_flag.h",
"__atomic/atomic_init.h",
"__atomic/atomic_lock_free.h",
+ "__atomic/atomic_ref.h",
"__atomic/atomic_sync.h",
"__atomic/check_memory_order.h",
"__atomic/contention_t.h",
@@ -302,6 +303,7 @@ if (current_toolchain == default_toolchain) {
"__atomic/is_always_lock_free.h",
"__atomic/kill_dependency.h",
"__atomic/memory_order.h",
+ "__atomic/to_gcc_order.h",
"__availability",
"__bit/bit_cast.h",
"__bit/bit_ceil.h",
diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
index 80a91507fcc6..e93130eacdc7 100644
--- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn
@@ -10,7 +10,6 @@ import("//llvm/utils/gn/build/buildflags.gni")
import("//llvm/utils/gn/build/libs/curl/enable.gni")
import("//llvm/utils/gn/build/libs/edit/enable.gni")
import("//llvm/utils/gn/build/libs/pthread/enable.gni")
-import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
import("//llvm/utils/gn/build/libs/xar/enable.gni")
import("//llvm/utils/gn/build/libs/xml/enable.gni")
import("//llvm/utils/gn/build/libs/zlib/enable.gni")
@@ -294,12 +293,6 @@ write_cmake_config("config") {
values += [ "HAVE_LIBEDIT=" ]
}
- if (llvm_enable_terminfo) {
- values += [ "LLVM_ENABLE_TERMINFO=1" ]
- } else {
- values += [ "LLVM_ENABLE_TERMINFO=" ]
- }
-
if (llvm_enable_libxml2) {
values += [ "LLVM_ENABLE_LIBXML2=1" ]
} else {
diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
index 941d448b3367..7728455499bf 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn
@@ -6,7 +6,6 @@ static_library("Support") {
"//llvm/include/llvm/Support:write_vcsrevision",
"//llvm/lib/Demangle",
"//llvm/utils/gn/build/libs/pthread",
- "//llvm/utils/gn/build/libs/terminfo",
"//llvm/utils/gn/build/libs/zlib",
]
diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
index bf50cd0fce46..711e4e3b4315 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llvm-config/BUILD.gn
@@ -1,7 +1,6 @@
import("//llvm/lib/Target/targets_string.gni")
import("//llvm/utils/gn/build/buildflags.gni")
import("//llvm/utils/gn/build/libs/pthread/enable.gni")
-import("//llvm/utils/gn/build/libs/terminfo/enable.gni")
import("//llvm/utils/gn/build/libs/xml/enable.gni")
import("//llvm/utils/gn/build/libs/zlib/enable.gni")
import("//llvm/utils/gn/build/write_cmake_config.gni")
@@ -36,7 +35,7 @@ write_cmake_config("BuildVariables.inc") {
lib = ""
}
- # Windows doesn't use any of libxml2, terminfo, zlib by default.
+ # Windows doesn't use libxml2 or zlib by default.
# Make GN not warn about these variables being unused.
not_needed([
"l",
@@ -63,9 +62,6 @@ write_cmake_config("BuildVariables.inc") {
if (llvm_enable_libxml2) {
system_libs += " ${l}xml2${lib}"
}
- if (llvm_enable_terminfo) {
- system_libs += " ${l}ncurses${lib}"
- }
if (llvm_enable_zlib) {
system_libs += " ${l}z${lib}"
}
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 4c0ef8387b8d..9f0b0d68b816 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -110,14 +110,6 @@ else()
set(MLIR_ENABLE_EXECUTION_ENGINE 0)
endif()
-# Build the CUDA conversions and run according tests if the NVPTX backend
-# is available
-if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
- set(MLIR_ENABLE_CUDA_CONVERSIONS 1)
-else()
- set(MLIR_ENABLE_CUDA_CONVERSIONS 0)
-endif()
-
# Build the ROCm conversions and run according tests if the AMDGPU backend
# is available.
if ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md
index e9ecb99462b8..7b19a7bf6bf4 100644
--- a/mlir/docs/PassManagement.md
+++ b/mlir/docs/PassManagement.md
@@ -1359,6 +1359,45 @@ func.func @simple_constant() -> (i32, i32) {
}
```
+* `mlir-print-ir-tree-dir=(directory path)`
+ * Without setting this option, the IR printed by the instrumentation will
+ be printed to `stderr`. If you provide a directory using this option,
+ the output corresponding to each pass will be printed to a file in the
+ directory tree rooted at `(directory path)`. The path created for each
+ pass reflects the nesting structure of the IR and the pass pipeline.
+ * The below example illustrates the file tree created by running a pass
+ pipeline on IR that has two `func.func` ops located within two nested
+ `builtin.module` ops.
+ * The subdirectories are given names that reflect the parent op names and
+ the symbol names for those ops (if present).
+ * The printer keeps a counter associated with ops that are targeted by
+ passes and their isolated-from-above parents. Each filename is given a
+ numeric prefix using the counter value for the op that the pass is
+ targeting. The counter values for each parent are then prepended. This
+ gives a naming scheme where it is easy to distinguish which passes may have
+ run concurrently versus which have a clear ordering. In the example below, for
+ both `1_1_pass4.mlir` files, the first 1 refers to the counter for the
+ parent op, and the second refers to the counter for the respective
+ function.
+
+```
+$ pipeline="builtin.module(pass1,pass2,func.func(pass3,pass4),pass5)"
+$ mlir-opt foo.mlir -pass-pipeline="$pipeline" -mlir-print-ir-tree-dir=/tmp/pipeline_output
+$ tree /tmp/pipeline_output
+
+/tmp/pipeline_output
+├── builtin_module_the_symbol_name
+│ ├── 0_pass1.mlir
+│ ├── 1_pass2.mlir
+│ ├── 2_pass5.mlir
+│ ├── func_func_my_func_name
+│ │ ├── 1_0_pass3.mlir
+│ │ ├── 1_1_pass4.mlir
+│ ├── func_func_my_other_func_name
+│ │ ├── 1_0_pass3.mlir
+│ │ ├── 1_1_pass4.mlir
+```
+
## Crash and Failure Reproduction
The [pass manager](#pass-manager) in MLIR contains a builtin mechanism to
diff --git a/mlir/include/mlir-c/Debug.h b/mlir/include/mlir-c/Debug.h
index 2502f2fa23bf..7dad73500858 100644
--- a/mlir/include/mlir-c/Debug.h
+++ b/mlir/include/mlir-c/Debug.h
@@ -21,6 +21,19 @@ MLIR_CAPI_EXPORTED void mlirEnableGlobalDebug(bool enable);
/// Returns `true` if the global debugging flag is set, false otherwise.
MLIR_CAPI_EXPORTED bool mlirIsGlobalDebugEnabled();
+/// Sets the current debug type, similarly to `-debug-only=type` in the
+/// command-line tools. Note that global debug should be enabled for any output
+/// to be produced.
+MLIR_CAPI_EXPORTED void mlirSetGlobalDebugType(const char *type);
+
+/// Sets multiple current debug types, similarly to `-debug-only=type1,type2` in
+/// the command-line tools. Note that global debug should be enabled for any
+/// output to be produced.
+MLIR_CAPI_EXPORTED void mlirSetGlobalDebugTypes(const char **types, intptr_t n);
+
+/// Checks if `type` is set as the current debug type.
+MLIR_CAPI_EXPORTED bool mlirIsCurrentDebugType(const char *type);
+
#ifdef __cplusplus
}
#endif
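
A minimal usage sketch for the new entry points, assuming the program links against the MLIR C API and that `mlir-c/Debug.h` is on the include path; the debug type names used here (dialect-conversion, greedy-rewriter) are illustrative, and whether the filtering actually takes effect depends on an MLIR build with debugging enabled:

```
// Drive MLIR's -debug/-debug-only style filtering through the C API.
#include <cstdio>
#include "mlir-c/Debug.h"

int main() {
  mlirEnableGlobalDebug(true); // Counterpart of the -debug command-line flag.

  // Restrict debug output to a single type, like -debug-only=dialect-conversion.
  mlirSetGlobalDebugType("dialect-conversion");
  std::printf("dialect-conversion active: %d\n",
              mlirIsCurrentDebugType("dialect-conversion"));

  // Or select several types at once, like -debug-only=type1,type2.
  const char *types[] = {"dialect-conversion", "greedy-rewriter"};
  mlirSetGlobalDebugTypes(types, 2);
  return 0;
}
```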
diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h
index d5cdf72c3889..99279fdfe427 100644
--- a/mlir/include/mlir/Analysis/SliceAnalysis.h
+++ b/mlir/include/mlir/Analysis/SliceAnalysis.h
@@ -223,11 +223,6 @@ SetVector<Operation *>
getSlice(Operation *op, const BackwardSliceOptions &backwardSliceOptions = {},
const ForwardSliceOptions &forwardSliceOptions = {});
-/// Multi-root DAG topological sort.
-/// Performs a topological sort of the Operation in the `toSort` SetVector.
-/// Returns a topologically sorted SetVector.
-SetVector<Operation *> topologicalSort(const SetVector<Operation *> &toSort);
-
/// Utility to match a generic reduction given a list of iteration-carried
/// arguments, `iterCarriedArgs` and the position of the potential reduction
/// argument within the list, `redPos`. If a reduction is matched, returns the
diff --git a/mlir/include/mlir/Transforms/TopologicalSortUtils.h b/mlir/include/mlir/Analysis/TopologicalSortUtils.h
index 74e44b1dc485..ee98cd8cb380 100644
--- a/mlir/include/mlir/Transforms/TopologicalSortUtils.h
+++ b/mlir/include/mlir/Analysis/TopologicalSortUtils.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef MLIR_TRANSFORMS_TOPOLOGICALSORTUTILS_H
-#define MLIR_TRANSFORMS_TOPOLOGICALSORTUTILS_H
+#ifndef MLIR_ANALYSIS_TOPOLOGICALSORTUTILS_H
+#define MLIR_ANALYSIS_TOPOLOGICALSORTUTILS_H
#include "mlir/IR/Block.h"
@@ -104,6 +104,14 @@ bool computeTopologicalSorting(
MutableArrayRef<Operation *> ops,
function_ref<bool(Value, Operation *)> isOperandReady = nullptr);
+/// Gets a list of blocks that is sorted according to dominance. This sort is
+/// stable.
+SetVector<Block *> getBlocksSortedByDominance(Region &region);
+
+/// Sorts all operations in `toSort` topologically while also considering region
+/// semantics. Does not support multi-sets.
+SetVector<Operation *> topologicalSort(const SetVector<Operation *> &toSort);
+
} // end namespace mlir
-#endif // MLIR_TRANSFORMS_TOPOLOGICALSORTUTILS_H
+#endif // MLIR_ANALYSIS_TOPOLOGICALSORTUTILS_H
diff --git a/mlir/include/mlir/Config/mlir-config.h.cmake b/mlir/include/mlir/Config/mlir-config.h.cmake
index 9339ce07bdfd..abd6f41b42ff 100644
--- a/mlir/include/mlir/Config/mlir-config.h.cmake
+++ b/mlir/include/mlir/Config/mlir-config.h.cmake
@@ -39,10 +39,6 @@
/* If set, enables PDL usage. */
#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH
-/* If set, enables CUDA-related features in CUDA-related transforms, pipelines,
- and targets. */
-#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS
-
/* If set, enables features that depend on the NVIDIA's PTX compiler. */
#cmakedefine01 MLIR_ENABLE_NVPTXCOMPILER
diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
index 4e4c6fd60177..ead52332e8ee 100644
--- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
+++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td
@@ -1540,6 +1540,18 @@ def Arith_CmpFOp : Arith_CompareOp<"cmpf",
// SelectOp
//===----------------------------------------------------------------------===//
+class AnyBooleanTypeMatch<list<string> names> :
+ AnyMatchOperatorTrait<names, "$_self.getType().isSignlessInteger(1)",
+ "scalar type">;
+
+class ScalarConditionOrMatchingShape<list<string> names> :
+ PredOpTrait<
+ !head(names) # " is scalar or has matching shape",
+ Or<[AnyBooleanTypeMatch<[!head(names)]>.predicate,
+ AllShapesMatch<names>.predicate]>> {
+ list<string> values = names;
+}
+
def SelectOp : Arith_Op<"select", [Pure,
AllTypesMatch<["true_value", "false_value", "result"]>,
ScalarConditionOrMatchingShape<["condition", "result"]>,
@@ -1548,16 +1560,16 @@ def SelectOp : Arith_Op<"select", [Pure,
let summary = "select operation";
let description = [{
The `arith.select` operation chooses one value based on a binary condition
- supplied as its first operand.
-
- If the value of the first operand (the condition) is `1`, then the second
- operand is returned, and the third operand is ignored, even if it was poison.
-
- If the value of the first operand (the condition) is `0`, then the third
- operand is returned, and the second operand is ignored, even if it was poison.
-
- If the value of the first operand (the condition) is poison, then the
- operation returns poison.
+ supplied as its first operand.
+
+ If the value of the first operand (the condition) is `1`, then the second
+ operand is returned, and the third operand is ignored, even if it was poison.
+
+ If the value of the first operand (the condition) is `0`, then the third
+ operand is returned, and the second operand is ignored, even if it was poison.
+
+ If the value of the first operand (the condition) is poison, then the
+ operation returns poison.
The operation applies to vectors and tensors elementwise given the _shape_
of all operands is identical. The choice is made for each element
diff --git a/mlir/include/mlir/Dialect/CommonFolders.h b/mlir/include/mlir/Dialect/CommonFolders.h
index 7dabc781cd59..6f497a259262 100644
--- a/mlir/include/mlir/Dialect/CommonFolders.h
+++ b/mlir/include/mlir/Dialect/CommonFolders.h
@@ -298,7 +298,10 @@ Attribute constFoldCastOp(ArrayRef<Attribute> operands, Type resType,
calculate(op.getSplatValue<ElementValueT>(), castStatus);
if (!castStatus)
return {};
- return DenseElementsAttr::get(cast<ShapedType>(resType), elementResult);
+ auto shapedResType = cast<ShapedType>(resType);
+ if (!shapedResType.hasStaticShape())
+ return {};
+ return DenseElementsAttr::get(shapedResType, elementResult);
}
if (auto op = dyn_cast<ElementsAttr>(operands[0])) {
// Operand is ElementsAttr-derived; perform an element-wise fold by
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
index 13e10b29c074..a7bf8796c027 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.h
@@ -13,7 +13,6 @@
#include "mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h"
#include "mlir/Dialect/LLVMIR/Transforms/OptimizeForNVVM.h"
#include "mlir/Dialect/LLVMIR/Transforms/RequestCWrappers.h"
-#include "mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h"
#include "mlir/Pass/Pass.h"
namespace mlir {
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
index 0242cfd9abb7..11d1b9411071 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/Transforms/Passes.td
@@ -43,24 +43,6 @@ def LLVMRequestCWrappers
let constructor = "::mlir::LLVM::createRequestCWrappersPass()";
}
-def LLVMTypeConsistency
- : Pass<"llvm-type-consistency", "::mlir::LLVM::LLVMFuncOp"> {
- let summary = "Rewrites to improve type consistency";
- let description = [{
- Set of rewrites to improve the coherency of types within an LLVM dialect
- program. This will adjust operations operating on pointers so they interpret
- their associated pointee type as consistently as possible.
- }];
- let constructor = "::mlir::LLVM::createTypeConsistencyPass()";
-
- let options = [
- Option<"maxVectorSplitSize", "max-vector-split-size", "unsigned",
- /*default=*/"512",
- "Maximum size in bits of a vector value in a load or store operation"
- " operating on multiple elements that should still be split">,
- ];
-}
-
def NVVMOptimizeForTarget : Pass<"llvm-optimize-for-nvvm-target"> {
let summary = "Optimize NVVM IR";
let constructor = "::mlir::NVVM::createOptimizeForTargetPass()";
diff --git a/mlir/include/mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h b/mlir/include/mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h
deleted file mode 100644
index a4bb380b99b8..000000000000
--- a/mlir/include/mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h
+++ /dev/null
@@ -1,73 +0,0 @@
-//===- TypeConsistency.h - Rewrites to improve type consistency -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Set of rewrites to improve the coherency of types within an LLVM dialect
-// program. This will adjust operations around a given pointer so they interpret
-// its pointee type as consistently as possible.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_LLVMIR_TRANSFORMS_TYPECONSISTENCY_H
-#define MLIR_DIALECT_LLVMIR_TRANSFORMS_TYPECONSISTENCY_H
-
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/IR/PatternMatch.h"
-#include "mlir/Pass/Pass.h"
-
-namespace mlir {
-namespace LLVM {
-
-#define GEN_PASS_DECL_LLVMTYPECONSISTENCY
-#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc"
-
-/// Creates a pass that adjusts operations operating on pointers so they
-/// interpret pointee types as consistently as possible.
-std::unique_ptr<Pass> createTypeConsistencyPass();
-
-/// Canonicalizes GEPs of which the base type and the pointer's type hint do not
-/// match. This is done by replacing the original GEP into a GEP with the type
-/// hint as a base type when an element of the hinted type aligns with the
-/// original GEP.
-class CanonicalizeAlignedGep : public OpRewritePattern<GEPOp> {
-public:
- using OpRewritePattern::OpRewritePattern;
-
- LogicalResult matchAndRewrite(GEPOp gep,
- PatternRewriter &rewriter) const override;
-};
-
-/// Splits stores which write into multiple adjacent elements of an aggregate
-/// through a pointer. Currently, integers and vector are split and stores
-/// are generated for every element being stored to in a type-consistent manner.
-/// This is done on a best-effort basis.
-class SplitStores : public OpRewritePattern<StoreOp> {
- unsigned maxVectorSplitSize;
-
-public:
- SplitStores(MLIRContext *context, unsigned maxVectorSplitSize)
- : OpRewritePattern(context), maxVectorSplitSize(maxVectorSplitSize) {}
-
- LogicalResult matchAndRewrite(StoreOp store,
- PatternRewriter &rewrite) const override;
-};
-
-/// Splits GEPs with more than two indices into multiple GEPs with exactly
-/// two indices. The created GEPs are then guaranteed to index into only
-/// one aggregate at a time.
-class SplitGEP : public OpRewritePattern<GEPOp> {
-public:
- using OpRewritePattern::OpRewritePattern;
-
- LogicalResult matchAndRewrite(GEPOp gepOp,
- PatternRewriter &rewriter) const override;
-};
-
-} // namespace LLVM
-} // namespace mlir
-
-#endif // MLIR_DIALECT_LLVMIR_TRANSFORMS_TYPECONSISTENCY_H
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 5585ba27fdad..93e2c2db729d 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1681,7 +1681,7 @@ def TileReductionUsingForOp : Op<Transform_Dialect, "structured.tile_reduction_u
// TODO: support mixed static-dynamic (see TileUsingForallOp).
let arguments = (ins TransformHandleTypeInterface:$target,
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$tile_sizes);
- let results = (outs TransformHandleTypeInterface:$fill_op,
+ let results = (outs Variadic<TransformHandleTypeInterface>:$fill_op,
TransformHandleTypeInterface:$split_linalg_op,
TransformHandleTypeInterface:$combining_linalg_op,
TransformHandleTypeInterface:$for_op);
@@ -1787,7 +1787,7 @@ def TileReductionUsingForallOp :
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$num_threads,
DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$tile_sizes,
OptionalAttr<DeviceMappingArrayAttr>:$mapping);
- let results = (outs TransformHandleTypeInterface:$fill_op,
+ let results = (outs Variadic<TransformHandleTypeInterface>:$fill_op,
TransformHandleTypeInterface:$split_linalg_op,
TransformHandleTypeInterface:$combining_linalg_op,
TransformHandleTypeInterface:$forall_op);
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index f77c19ed0fcc..308ce92e3552 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -876,8 +876,8 @@ struct ForallReductionTilingResult {
Operation *parallelTiledOp;
/// The final reduction operation merging all the partial reductions.
Operation *mergeOp;
- /// The op initializing the tensor used for partial reductions.
- Operation *initialOp;
+ /// Initial values used for partial reductions.
+ SmallVector<Value> initialValues;
/// The `scf.forall` operation that iterate over the tiles.
scf::ForallOp loops;
};
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
index 9d9b5892e1a5..3a85bf2d552f 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshBase.td
@@ -151,7 +151,9 @@ def MeshSharding : AttrDef<Mesh_Dialect, "MeshSharding"> {
let extraClassDeclaration = [{
bool operator==(::mlir::Attribute rhs) const;
+ bool operator!=(::mlir::Attribute rhs) const;
bool operator==(::mlir::mesh::MeshShardingAttr rhs) const;
+ bool operator!=(::mlir::mesh::MeshShardingAttr rhs) const;
}];
let genVerifyDecl = 1;
diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
index 4569b77441c3..7a24c201a39a 100644
--- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
+++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h
@@ -51,15 +51,26 @@ void removeTrailingEmptySubArray(SmallVector<SmallVector<T>> &array) {
// Is the same tensor replicated on all processes.
inline bool isFullReplication(MeshShardingAttr attr) {
- return attr.getPartialAxes().empty() && attr.getSplitAxes().empty();
+ return attr.getPartialAxes().empty() &&
+ llvm::all_of(attr.getSplitAxes(), [](MeshAxesAttr axes) {
+ return axes.asArrayRef().empty();
+ });
}
-inline mesh::MeshOp getMesh(Operation *op, FlatSymbolRefAttr meshSymbol,
- SymbolTableCollection &symbolTableCollection) {
+inline mesh::MeshOp
+getMeshOrNull(Operation *op, FlatSymbolRefAttr meshSymbol,
+ SymbolTableCollection &symbolTableCollection) {
return symbolTableCollection.lookupNearestSymbolFrom<mesh::MeshOp>(
op, meshSymbol);
}
+inline mesh::MeshOp getMesh(Operation *op, FlatSymbolRefAttr meshSymbol,
+ SymbolTableCollection &symbolTableCollection) {
+ mesh::MeshOp meshOp = getMeshOrNull(op, meshSymbol, symbolTableCollection);
+ assert(meshOp);
+ return meshOp;
+}
+
// Get the corresponding mesh op using the standard attribute nomenclature.
template <typename Op>
mesh::MeshOp getMesh(Op op, SymbolTableCollection &symbolTableCollection) {
@@ -128,6 +139,17 @@ ShapedType shardShapedType(ShapedType shape, MeshOp mesh,
// `sharding` in that case must be null.
Type shardType(Type type, MeshOp mesh, MeshShardingAttr sharding);
+// Insert shard op if there is not one that already has the same sharding.
+// May insert resharding if required.
+void maybeInsertTargetShardingAnnotation(MeshShardingAttr sharding,
+ OpOperand &operand,
+ OpBuilder &builder);
+void maybeInsertTargetShardingAnnotation(MeshShardingAttr sharding,
+ OpResult result, OpBuilder &builder);
+void maybeInsertSourceShardingAnnotation(MeshShardingAttr sharding,
+ OpOperand &operand,
+ OpBuilder &builder);
+
} // namespace mesh
} // namespace mlir
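The split into `getMeshOrNull` and an asserting `getMesh` gives callers a way to probe for a missing mesh symbol before verification. A minimal sketch of the intended call pattern, assuming a `SymbolTableCollection` is already available (the diagnostic text is illustrative):

```c++
#include "mlir/Dialect/Mesh/IR/MeshOps.h"

using namespace mlir;

// Sketch: resolve the mesh referenced by an op, tolerating a missing symbol.
static mesh::MeshOp resolveMesh(Operation *op, FlatSymbolRefAttr meshSymbol,
                                SymbolTableCollection &symbolTables) {
  // getMeshOrNull returns a null op when the symbol cannot be resolved,
  // so verification-style code can emit a diagnostic instead of asserting.
  if (mesh::MeshOp meshOp = mesh::getMeshOrNull(op, meshSymbol, symbolTables))
    return meshOp;
  op->emitError() << "undefined required mesh symbol: " << meshSymbol;
  return nullptr;
}
```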
diff --git a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h
index c47a7ddd3f9c..216d7e10296d 100644
--- a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h
+++ b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h
@@ -37,6 +37,11 @@ struct ShardingOption {
ShardingOption() = default;
ShardingOption(ShardingArray shardingArray, FlatSymbolRefAttr mesh)
: shardingArray(std::move(shardingArray)), mesh(mesh) {}
+ static ShardingOption makeEmpty() {
+ auto res = ShardingOption();
+ res.empty = true;
+ return res;
+ }
};
// This method retrieves the 'MeshShardingAttr' attribute from a given operation
@@ -56,6 +61,10 @@ defaultGetShardingOption(Operation *op,
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings);
+FailureOr<SmallVector<MeshShardingAttr>>
+defaultGetShardingAnnotations(Operation *op,
+ const ShardingOption &shardingOption);
+
LogicalResult
defaultAddShardingAnnotations(Operation *op, OpBuilder &b,
const ShardingOption &shardingOption);
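Together with `ShardingOption::makeEmpty`, the new `defaultGetShardingAnnotations` hook lets clients ask what shardings an op's operands and results would need without touching the IR. A rough sketch that goes through the interface method added further below, assuming `op` may or may not implement `ShardingInterface`:

```c++
#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"

using namespace mlir;

// Sketch: query the required operand/result shardings for a ShardingOption,
// without inserting any mesh.shard ops.
static FailureOr<SmallVector<mesh::MeshShardingAttr>>
queryRequiredShardings(Operation *op, const mesh::ShardingOption &option) {
  auto shardingOp = dyn_cast<mesh::ShardingInterface>(op);
  if (!shardingOp)
    return failure();
  // Falls back to detail::defaultGetShardingAnnotations unless overridden.
  return shardingOp.getShardingAnnotations(option);
}
```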
diff --git a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td
index 1f75135f4288..47a74f619f56 100644
--- a/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td
+++ b/mlir/include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td
@@ -75,8 +75,11 @@ def ShardingInterface : OpInterface<"ShardingInterface"> {
InterfaceMethod<
/*desc=*/[{
Given that certain operands or results of the operation may have
- sharding annotations, this method leverages this information to deduce
- how the operation should be sharded.
+ sharding annotations, this method leverages this information to
+ deduce how the operation should be sharded.
+ The passed sharding may be incomplete; this gives the op the freedom
+ to select the most appropriate shardings for all of its operands and
+ results, as well as for the op itself.
}],
/*retTy=*/"FailureOr<ShardingOption>",
/*methodName=*/"getShardingOption",
@@ -92,6 +95,24 @@ def ShardingInterface : OpInterface<"ShardingInterface"> {
>,
InterfaceMethod<
/*desc=*/[{
+ Based on a given ShardingOption, return the sharding annotations for
+ the operands and results.
+ These are the shardings the operands and results need to have in order
+ to shard the op according to shardingOption.
+ }],
+ /*retTy=*/"FailureOr<SmallVector<MeshShardingAttr>>",
+ /*methodName=*/"getShardingAnnotations",
+ /*args=*/(ins
+ "const ShardingOption &":$shardingOption
+ ),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{
+ return detail::defaultGetShardingAnnotations(
+ $_op.getOperation(), shardingOption);
+ }]
+ >,
+ InterfaceMethod<
+ /*desc=*/[{
Based on a given ShardingOption, this method adds `mesh.shard`
operations for the operands and results that previously lacked
sharding annotations.
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 122abbe7cc97..dc9ac2b9de22 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -152,13 +152,9 @@ def ParallelOp : OpenMP_Op<"parallel", [
variable should be passed into the reduction region by value or by reference
in `reduction_vars_byref`. Each reduction is identified by the accumulator
it uses and accumulators must not be repeated in the same reduction. The
- `omp.reduction` operation accepts the accumulator and a partial value which
- is considered to be produced by the thread for the given reduction. If
- multiple values are produced for the same accumulator, i.e. there are
- multiple `omp.reduction`s, the last value is taken. The reduction
- declaration specifies how to combine the values from each thread into the
- final value, which is available in the accumulator after all the threads
- complete.
+ reduction declaration specifies how to combine the values from each thread
+ into the final value, which is available in the accumulator after all the
+ threads complete.
The optional $proc_bind_val attribute controls the thread affinity for the execution
of the parallel region.
@@ -307,13 +303,9 @@ def SectionsOp : OpenMP_Op<"sections", [AttrSizedOperandSegments,
accumulator variables in `reduction_vars` and symbols referring to reduction
declarations in the `reductions` attribute. Each reduction is identified
by the accumulator it uses and accumulators must not be repeated in the same
- reduction. The `omp.reduction` operation accepts the accumulator and a
- partial value which is considered to be produced by the section for the
- given reduction. If multiple values are produced for the same accumulator,
- i.e. there are multiple `omp.reduction`s, the last value is taken. The
- reduction declaration specifies how to combine the values from each section
- into the final value, which is available in the accumulator after all the
- sections complete.
+ reduction. The reduction declaration specifies how to combine the values
+ from each section into the final value, which is available in the
+ accumulator after all the sections complete.
The $allocators_vars and $allocate_vars parameters are a variadic list of values
that specify the memory allocator to be used to obtain storage for private values.
@@ -912,11 +904,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments,
variables in `reduction_vars` or `in_reduction_vars` and symbols referring
to reduction declarations in the `reductions` or `in_reductions` attribute.
Each reduction is identified by the accumulator it uses and accumulators
- must not be repeated in the same reduction. The `omp.reduction` operation
- accepts the accumulator and a partial value which is considered to be
- produced by the current loop iteration for the given reduction. If multiple
- values are produced for the same accumulator, i.e. there are multiple
- `omp.reduction`s, the last value is taken. The reduction declaration
+ must not be repeated in the same reduction. The reduction declaration
specifies how to combine the values from each iteration into the final
value, which is available in the accumulator after the loop completes.
@@ -2159,24 +2147,4 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [Symbol,
let hasRegionVerifier = 1;
}
-//===----------------------------------------------------------------------===//
-// 2.19.5.4 reduction clause
-//===----------------------------------------------------------------------===//
-
-def ReductionOp : OpenMP_Op<"reduction"> {
- let summary = "reduction construct";
- let description = [{
- Indicates the value that is produced by the current reduction-participating
- entity for a reduction requested in some ancestor. The reduction is
- identified by the accumulator, but the value of the accumulator may not be
- updated immediately.
- }];
-
- let arguments= (ins AnyType:$operand, OpenMP_PointerLikeType:$accumulator);
- let assemblyFormat = [{
- $operand `,` $accumulator attr-dict `:` type($operand) `,` type($accumulator)
- }];
- let hasVerifier = 1;
-}
-
#endif // OPENMP_OPS
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
index 3ef899d3376b..f99cbccd243e 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/Polynomial.td
@@ -52,8 +52,8 @@ def Polynomial_AddOp : Polynomial_BinaryOp<"add", [Commutative]> {
// add two polynomials modulo x^1024 - 1
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
- %1 = polynomial.constant #polynomial.int_polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant int<x**5 - x + 1> : !polynomial.polynomial<#ring>
%2 = polynomial.add %0, %1 : !polynomial.polynomial<#ring>
```
}];
@@ -76,8 +76,8 @@ def Polynomial_SubOp : Polynomial_BinaryOp<"sub"> {
// subtract two polynomials modulo x^1024 - 1
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
- %1 = polynomial.constant #polynomial.int_polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant int<x**5 - x + 1> : !polynomial.polynomial<#ring>
%2 = polynomial.sub %0, %1 : !polynomial.polynomial<#ring>
```
}];
@@ -101,8 +101,8 @@ def Polynomial_MulOp : Polynomial_BinaryOp<"mul", [Commutative]> {
// multiply two polynomials modulo x^1024 - 1
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
- %1 = polynomial.constant #polynomial.int_polynomial<x**5 - x + 1> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
+ %1 = polynomial.constant int<x**5 - x + 1> : !polynomial.polynomial<#ring>
%2 = polynomial.mul %0, %1 : !polynomial.polynomial<#ring>
```
}];
@@ -126,7 +126,7 @@ def Polynomial_MulScalarOp : Polynomial_Op<"mul_scalar", [
// multiply two polynomials modulo x^1024 - 1
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
%1 = arith.constant 3 : i32
%2 = polynomial.mul_scalar %0, %1 : !polynomial.polynomial<#ring>, i32
```
@@ -157,7 +157,7 @@ def Polynomial_LeadingTermOp: Polynomial_Op<"leading_term"> {
```mlir
#poly = #polynomial.int_polynomial<x**1024 - 1>
#ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<#ring>
%1, %2 = polynomial.leading_term %0 : !polynomial.polynomial<#ring> -> (index, i32)
```
}];
@@ -272,29 +272,29 @@ def Polynomial_ToTensorOp : Polynomial_Op<"to_tensor", [Pure]> {
let hasVerifier = 1;
}
-def Polynomial_AnyPolynomialAttr : AnyAttrOf<[
- Polynomial_FloatPolynomialAttr,
- Polynomial_IntPolynomialAttr
+def Polynomial_AnyTypedPolynomialAttr : AnyAttrOf<[
+ Polynomial_TypedFloatPolynomialAttr,
+ Polynomial_TypedIntPolynomialAttr
]>;
// Not deriving from Polynomial_Op due to need for custom assembly format
-def Polynomial_ConstantOp : Op<Polynomial_Dialect, "constant", [Pure]> {
+def Polynomial_ConstantOp : Op<Polynomial_Dialect, "constant",
+ [Pure, InferTypeOpAdaptor]> {
let summary = "Define a constant polynomial via an attribute.";
let description = [{
Example:
```mlir
- #poly = #polynomial.int_polynomial<x**1024 - 1>
- #ring = #polynomial.ring<coefficientType=i32, coefficientModulus=65536:i32, polynomialModulus=#poly>
- %0 = polynomial.constant #polynomial.int_polynomial<1 + x**2> : !polynomial.polynomial<#ring>
+ !int_poly_ty = !polynomial.polynomial<ring=<coefficientType=i32>>
+ %0 = polynomial.constant int<1 + x**2> : !int_poly_ty
- #float_ring = #polynomial.ring<coefficientType=f32>
- %0 = polynomial.constant #polynomial.float_polynomial<0.5 + 1.3e06 x**2> : !polynomial.polynomial<#float_ring>
+ !float_poly_ty = !polynomial.polynomial<ring=<coefficientType=f32>>
+ %1 = polynomial.constant float<0.5 + 1.3e06 x**2> : !float_poly_ty
```
}];
- let arguments = (ins Polynomial_AnyPolynomialAttr:$value);
+ let arguments = (ins Polynomial_AnyTypedPolynomialAttr:$value);
let results = (outs Polynomial_PolynomialType:$output);
- let assemblyFormat = "attr-dict `:` type($output)";
+ let hasCustomAssemblyFormat = 1;
}
def Polynomial_NTTOp : Polynomial_Op<"ntt", [Pure]> {
diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td
index e5dbfa7fa21e..655020adf808 100644
--- a/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td
+++ b/mlir/include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.td
@@ -18,7 +18,7 @@ class Polynomial_Attr<string name, string attrMnemonic, list<Trait> traits = []>
}
def Polynomial_IntPolynomialAttr : Polynomial_Attr<"IntPolynomial", "int_polynomial"> {
- let summary = "An attribute containing a single-variable polynomial with integer coefficients.";
+ let summary = "an attribute containing a single-variable polynomial with integer coefficients";
let description = [{
A polynomial attribute represents a single-variable polynomial with integer
coefficients, which is used to define the modulus of a `RingAttr`, as well
@@ -41,7 +41,7 @@ def Polynomial_IntPolynomialAttr : Polynomial_Attr<"IntPolynomial", "int_polynom
}
def Polynomial_FloatPolynomialAttr : Polynomial_Attr<"FloatPolynomial", "float_polynomial"> {
- let summary = "An attribute containing a single-variable polynomial with double precision floating point coefficients.";
+ let summary = "an attribute containing a single-variable polynomial with double precision floating point coefficients";
let description = [{
A polynomial attribute represents a single-variable polynomial with double
precision floating point coefficients.
@@ -62,8 +62,72 @@ def Polynomial_FloatPolynomialAttr : Polynomial_Attr<"FloatPolynomial", "float_p
let hasCustomAssemblyFormat = 1;
}
+def Polynomial_TypedIntPolynomialAttr : Polynomial_Attr<
+ "TypedIntPolynomial", "typed_int_polynomial", [TypedAttrInterface]> {
+ let summary = "a typed int_polynomial";
+ let description = [{
+ Example:
+
+ ```mlir
+ !poly_ty = !polynomial.polynomial<ring=<coefficientType=i32>>
+ #poly = int<1 x**7 + 4> : !poly_ty
+ #poly_verbose = #polynomial.typed_int_polynomial<1 x**7 + 4> : !poly_ty
+ ```
+ }];
+ let parameters = (ins "::mlir::Type":$type, "::mlir::polynomial::IntPolynomialAttr":$value);
+ let assemblyFormat = "$value `:` $type";
+ let builders = [
+ AttrBuilderWithInferredContext<(ins "Type":$type,
+ "const IntPolynomial &":$value), [{
+ return $_get(
+ type.getContext(),
+ type,
+ IntPolynomialAttr::get(type.getContext(), value));
+ }]>,
+ AttrBuilderWithInferredContext<(ins "Type":$type,
+ "const Attribute &":$value), [{
+ return $_get(type.getContext(), type, ::llvm::cast<IntPolynomialAttr>(value));
+ }]>
+ ];
+ let extraClassDeclaration = [{
+ using ValueType = ::mlir::Attribute;
+ }];
+}
+
+def Polynomial_TypedFloatPolynomialAttr : Polynomial_Attr<
+ "TypedFloatPolynomial", "typed_float_polynomial", [TypedAttrInterface]> {
+ let summary = "a typed float_polynomial";
+ let description = [{
+ Example:
+
+ ```mlir
+ !poly_ty = !polynomial.polynomial<ring=<coefficientType=f32>>
+ #poly = float<1.4 x**7 + 4.5> : !poly_ty
+ #poly_verbose = #polynomial.typed_float_polynomial<1.4 x**7 + 4.5> : !poly_ty
+ ```
+ }];
+ let parameters = (ins "::mlir::Type":$type, "::mlir::polynomial::FloatPolynomialAttr":$value);
+ let assemblyFormat = "$value `:` $type";
+ let builders = [
+ AttrBuilderWithInferredContext<(ins "Type":$type,
+ "const FloatPolynomial &":$value), [{
+ return $_get(
+ type.getContext(),
+ type,
+ FloatPolynomialAttr::get(type.getContext(), value));
+ }]>,
+ AttrBuilderWithInferredContext<(ins "Type":$type,
+ "const Attribute &":$value), [{
+ return $_get(type.getContext(), type, ::llvm::cast<FloatPolynomialAttr>(value));
+ }]>
+ ];
+ let extraClassDeclaration = [{
+ using ValueType = ::mlir::Attribute;
+ }];
+}
+
def Polynomial_RingAttr : Polynomial_Attr<"Ring", "ring"> {
- let summary = "An attribute specifying a polynomial ring.";
+ let summary = "an attribute specifying a polynomial ring";
let description = [{
A ring describes the domain in which polynomial arithmetic occurs. The ring
attribute in `polynomial` represents the more specific case of polynomials
diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
index 965ef9e203be..6d567171e185 100644
--- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
+++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h
@@ -250,8 +250,8 @@ struct SCFReductionTilingResult {
Operation *parallelTiledOp;
/// The final reduction operation merging all the partial reductions.
Operation *mergeOp;
- /// Initial op
- Operation *initialOp;
+ /// Initial values used for reduction.
+ SmallVector<Value> initialValues;
/// The loop operations that iterate over the tiles.
SmallVector<LoopLikeOpInterface> loops;
};
diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
index e8a09c474104..dd6b0e868256 100644
--- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
@@ -59,8 +59,8 @@ void populateDropRedundantInsertSliceRankExpansionPatterns(
/// `tensor.collapse_shape` into other ops.
void populateReassociativeReshapeFoldingPatterns(RewritePatternSet &patterns);
-/// Populates `patterns` with patterns that fold tensor.empty with
-/// tensor.[extract_slice|expand_shape|collapse_shape].
+/// Populates `patterns` with patterns that fold tensor.empty with its
+/// consumers.
///
/// If `singleUseOnly` is set to "true", only tensor.empty ops with a single
/// use are folded.
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 332b5ad08ced..2bb7540ef0b0 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -543,6 +543,86 @@ def Vector_InterleaveOp :
}];
}
+class ResultIsHalfSourceVectorType<string result> : TypesMatchWith<
+ "the trailing dimension of the results is half the width of the source's trailing dimension",
+ "source", result,
+ [{
+ [&]() -> ::mlir::VectorType {
+ auto vectorType = ::llvm::cast<mlir::VectorType>($_self);
+ ::mlir::VectorType::Builder builder(vectorType);
+ auto lastDim = vectorType.getRank() - 1;
+ auto newDimSize = vectorType.getDimSize(lastDim) / 2;
+ if (newDimSize <= 0)
+ return vectorType; // (invalid input type)
+ return builder.setDim(lastDim, newDimSize);
+ }()
+ }]
+>;
+
+def SourceVectorEvenElementCount : PredOpTrait<
+ "the trailing dimension of the source vector has an even number of elements",
+ CPred<[{
+ [&](){
+ auto srcVec = getSourceVectorType();
+ return srcVec.getDimSize(srcVec.getRank() - 1) % 2 == 0;
+ }()
+ }]>
+>;
+
+def Vector_DeinterleaveOp :
+ Vector_Op<"deinterleave", [Pure,
+ SourceVectorEvenElementCount,
+ ResultIsHalfSourceVectorType<"res1">,
+ AllTypesMatch<["res1", "res2"]>
+ ]> {
+ let summary = "constructs two vectors by deinterleaving an input vector";
+ let description = [{
+ The deinterleave operation constructs two vectors from a single input
+ vector. The first result vector contains the elements from even indexes
+ of the input, and the second contains elements from odd indexes. This is
+ the inverse of a `vector.interleave` operation.
+
+ Each output's trailing dimension is half of the size of the input
+ vector's trailing dimension. This operation requires the input vector
+ to have a rank > 0 and an even number of elements in its trailing
+ dimension.
+
+ The operation supports scalable vectors.
+
+ Example:
+ ```mlir
+ %0, %1 = vector.deinterleave %a
+ : vector<8xi8> -> vector<4xi8>
+ %2, %3 = vector.deinterleave %b
+ : vector<2x8xi8> -> vector<2x4xi8>
+ %4, %5 = vector.deinterleave %c
+ : vector<2x8x4xi8> -> vector<2x8x2xi8>
+ %6, %7 = vector.deinterleave %d
+ : vector<[8]xf32> -> vector<[4]xf32>
+ %8, %9 = vector.deinterleave %e
+ : vector<2x[6]xf64> -> vector<2x[3]xf64>
+ %10, %11 = vector.deinterleave %f
+ : vector<2x4x[6]xf64> -> vector<2x4x[3]xf64>
+ ```
+ }];
+
+ let arguments = (ins AnyVector:$source);
+ let results = (outs AnyVector:$res1, AnyVector:$res2);
+
+ let assemblyFormat = [{
+ $source attr-dict `:` type($source) `->` type($res1)
+ }];
+
+ let extraClassDeclaration = [{
+ VectorType getSourceVectorType() {
+ return ::llvm::cast<VectorType>(getSource().getType());
+ }
+ VectorType getResultVectorType() {
+ return ::llvm::cast<VectorType>(getRes1().getType());
+ }
+ }];
+ }
+
def Vector_ExtractElementOp :
Vector_Op<"extractelement", [Pure,
TypesMatchWith<"result type matches element type of vector operand",
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index 7866ac24c1cc..4481e56615b8 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -670,16 +670,4 @@ class TCopVTEtAreSameAt<list<int> indices> : CPred<
"[this](unsigned i) { return getElementTypeOrSelf(this->getOperand(i)); "
"}))">;
-class AnyScalarTypeMatch<list<string> names> :
- AnyMatchOperatorTrait<names, "$_self.getType().isSignlessInteger(1)",
- "scalar type">;
-
-class ScalarConditionOrMatchingShape<list<string> names> :
- PredOpTrait<
- !head(names) # " is scalar or has matching shape",
- Or<[AnyScalarTypeMatch<[!head(names)]>.predicate,
- AllShapesMatch<names>.predicate]>> {
- list<string> values = names;
-}
-
#endif // OP_BASE
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index 90406f555b0f..fedd7737f9ea 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -14,7 +14,6 @@
#ifndef MLIR_INITALLPASSES_H_
#define MLIR_INITALLPASSES_H_
-#include "mlir/Config/mlir-config.h"
#include "mlir/Conversion/Passes.h"
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
#include "mlir/Dialect/Affine/Passes.h"
@@ -99,7 +98,7 @@ inline void registerAllPasses() {
bufferization::registerBufferizationPipelines();
sparse_tensor::registerSparseTensorPipelines();
tosa::registerTosaToLinalgPipelines();
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
gpu::registerGPUToNVVMPipeline();
#endif
}
diff --git a/mlir/include/mlir/Interfaces/TilingInterface.td b/mlir/include/mlir/Interfaces/TilingInterface.td
index 66382f29c242..14d775d986d2 100644
--- a/mlir/include/mlir/Interfaces/TilingInterface.td
+++ b/mlir/include/mlir/Interfaces/TilingInterface.td
@@ -170,11 +170,11 @@ def PartialReductionOpInterface : OpInterface<"PartialReductionOpInterface"> {
operation reduction. The tensor shape is equal to operation result
shape with new dimension for each non zero tile size.
}],
- /*retType=*/"FailureOr<Operation*>",
+ /*retType=*/"FailureOr<SmallVector<Value>>",
/*methodName=*/"generateInitialTensorForPartialReduction",
/*args=*/(ins
"OpBuilder &":$b,
- "Location ":$loc,
+ "Location":$loc,
"ArrayRef<OpFoldResult>":$sizes,
"ArrayRef<int>":$reductionDim),
/*methodBody=*/"",
diff --git a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
index 97c97c23ba82..851bb534bc7e 100644
--- a/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
+++ b/mlir/include/mlir/Interfaces/Utils/InferIntRangeCommon.h
@@ -16,6 +16,7 @@
#include "mlir/Interfaces/InferIntRangeInterface.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitmaskEnum.h"
#include <optional>
namespace mlir {
@@ -31,6 +32,18 @@ static constexpr unsigned indexMaxWidth = 64;
enum class CmpMode : uint32_t { Both, Signed, Unsigned };
+enum class OverflowFlags : uint32_t {
+ None = 0,
+ Nsw = 1,
+ Nuw = 2,
+ LLVM_MARK_AS_BITMASK_ENUM(Nuw)
+};
+
+/// Function that performs inference on an array of `ConstantIntRanges` while
+/// taking special overflow behavior into account.
+using InferRangeWithOvfFlagsFn =
+ function_ref<ConstantIntRanges(ArrayRef<ConstantIntRanges>, OverflowFlags)>;
+
/// Compute `inferFn` on `ranges`, whose size should be the index storage
/// bitwidth. Then, compute the function on `argRanges` again after truncating
/// the ranges to 32 bits. Finally, if the truncation of the 64-bit result is
@@ -60,11 +73,14 @@ ConstantIntRanges extSIRange(const ConstantIntRanges &range,
ConstantIntRanges truncRange(const ConstantIntRanges &range,
unsigned destWidth);
-ConstantIntRanges inferAdd(ArrayRef<ConstantIntRanges> argRanges);
+ConstantIntRanges inferAdd(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags = OverflowFlags::None);
-ConstantIntRanges inferSub(ArrayRef<ConstantIntRanges> argRanges);
+ConstantIntRanges inferSub(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags = OverflowFlags::None);
-ConstantIntRanges inferMul(ArrayRef<ConstantIntRanges> argRanges);
+ConstantIntRanges inferMul(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags = OverflowFlags::None);
ConstantIntRanges inferDivS(ArrayRef<ConstantIntRanges> argRanges);
@@ -94,7 +110,8 @@ ConstantIntRanges inferOr(ArrayRef<ConstantIntRanges> argRanges);
ConstantIntRanges inferXor(ArrayRef<ConstantIntRanges> argRanges);
-ConstantIntRanges inferShl(ArrayRef<ConstantIntRanges> argRanges);
+ConstantIntRanges inferShl(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags = OverflowFlags::None);
ConstantIntRanges inferShrS(ArrayRef<ConstantIntRanges> argRanges);
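The `OverflowFlags` default keeps existing callers unchanged, while ops that carry `nsw`/`nuw` can pass the flags through. A small sketch of calling `inferAdd` directly on two 8-bit ranges (the values are chosen only for illustration):

```c++
#include "mlir/Interfaces/Utils/InferIntRangeCommon.h"
#include "llvm/ADT/APInt.h"

using namespace mlir;

// Sketch: infer the range of an 8-bit addition of [0, 200] and [0, 100].
static void inferAddExample() {
  ConstantIntRanges lhs =
      ConstantIntRanges::fromUnsigned(llvm::APInt(8, 0), llvm::APInt(8, 200));
  ConstantIntRanges rhs =
      ConstantIntRanges::fromUnsigned(llvm::APInt(8, 0), llvm::APInt(8, 100));

  // Default flags: the helper has to account for unsigned wraparound.
  ConstantIntRanges mayWrap = intrange::inferAdd({lhs, rhs});

  // With nuw the helper may assume the addition never wraps in 8 bits.
  ConstantIntRanges noWrap =
      intrange::inferAdd({lhs, rhs}, intrange::OverflowFlags::Nuw);
  (void)mayWrap;
  (void)noWrap;
}
```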
diff --git a/mlir/include/mlir/Pass/PassManager.h b/mlir/include/mlir/Pass/PassManager.h
index 1b2e6a3bc82b..b3e427588173 100644
--- a/mlir/include/mlir/Pass/PassManager.h
+++ b/mlir/include/mlir/Pass/PassManager.h
@@ -18,8 +18,8 @@
#include "llvm/Support/raw_ostream.h"
#include <functional>
-#include <vector>
#include <optional>
+#include <vector>
namespace mlir {
class AnalysisManager;
@@ -387,6 +387,43 @@ public:
bool printAfterOnlyOnFailure = false, raw_ostream &out = llvm::errs(),
OpPrintingFlags opPrintingFlags = OpPrintingFlags());
+ /// Similar to `enableIRPrinting` above, except that instead of printing
+ /// the IR to a single output stream, the instrumentation will print the
+ /// output of each pass to a separate file. The files will be organized into a
+ /// directory tree rooted at `printTreeDir`. The directories mirror the
+ /// nesting structure of the IR. For example, if the IR is congruent to the
+ /// pass-pipeline "builtin.module(passA,passB,func.func(passC,passD),passE)",
+ /// and `printTreeDir=/tmp/pipeline_output`, then the file tree
+ /// created will look like:
+ ///
+ /// ```
+ /// /tmp/pipeline_output
+ /// ├── builtin_module_the_symbol_name
+ /// │ ├── 0_passA.mlir
+ /// │ ├── 1_passB.mlir
+ /// │ ├── 2_passE.mlir
+ /// │ ├── func_func_my_func_name
+ /// │ │ ├── 1_0_passC.mlir
+ /// │ │ ├── 1_1_passD.mlir
+ /// │ ├── func_func_my_other_func_name
+ /// │ │ ├── 1_0_passC.mlir
+ /// │ │ ├── 1_1_passD.mlir
+ /// ```
+ ///
+ /// The subdirectories are given names that reflect the parent operation name
+ /// and symbol name (if present). The output MLIR files are prefixed using an
+ /// atomic counter to indicate the order the passes were printed in and to
+ /// prevent any potential name collisions.
+ void enableIRPrintingToFileTree(
+ std::function<bool(Pass *, Operation *)> shouldPrintBeforePass =
+ [](Pass *, Operation *) { return true; },
+ std::function<bool(Pass *, Operation *)> shouldPrintAfterPass =
+ [](Pass *, Operation *) { return true; },
+ bool printModuleScope = true, bool printAfterOnlyOnChange = true,
+ bool printAfterOnlyOnFailure = false,
+ llvm::StringRef printTreeDir = ".pass_manager_output",
+ OpPrintingFlags opPrintingFlags = OpPrintingFlags());
+
//===--------------------------------------------------------------------===//
// Pass Timing
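A minimal sketch of enabling the file-tree printer from C++, assuming an already configured `PassManager` (the output directory is only an example path):

```c++
#include "mlir/Pass/PassManager.h"

using namespace mlir;

// Sketch: dump the IR after every pass into a directory tree rather than a
// single stream; all other knobs keep their defaults.
static void enableTreeDump(PassManager &pm) {
  pm.enableIRPrintingToFileTree(
      /*shouldPrintBeforePass=*/[](Pass *, Operation *) { return false; },
      /*shouldPrintAfterPass=*/[](Pass *, Operation *) { return true; },
      /*printModuleScope=*/true,
      /*printAfterOnlyOnChange=*/true,
      /*printAfterOnlyOnFailure=*/false,
      /*printTreeDir=*/"/tmp/pipeline_output");
}
```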
diff --git a/mlir/include/mlir/Transforms/RegionUtils.h b/mlir/include/mlir/Transforms/RegionUtils.h
index f65d0d44eef4..06eebff201d1 100644
--- a/mlir/include/mlir/Transforms/RegionUtils.h
+++ b/mlir/include/mlir/Transforms/RegionUtils.h
@@ -87,10 +87,6 @@ LogicalResult eraseUnreachableBlocks(RewriterBase &rewriter,
LogicalResult runRegionDCE(RewriterBase &rewriter,
MutableArrayRef<Region> regions);
-/// Get a list of blocks that is sorted according to dominance. This sort is
-/// stable.
-SetVector<Block *> getBlocksSortedByDominance(Region &region);
-
} // namespace mlir
#endif // MLIR_TRANSFORMS_REGIONUTILS_H_
diff --git a/mlir/lib/Analysis/CMakeLists.txt b/mlir/lib/Analysis/CMakeLists.txt
index 005814ddbec7..38d8415d81c7 100644
--- a/mlir/lib/Analysis/CMakeLists.txt
+++ b/mlir/lib/Analysis/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_OPTIONAL_SOURCES
Liveness.cpp
CFGLoopInfo.cpp
SliceAnalysis.cpp
+ TopologicalSortUtils.cpp
AliasAnalysis/LocalAliasAnalysis.cpp
@@ -28,6 +29,7 @@ add_mlir_library(MLIRAnalysis
Liveness.cpp
CFGLoopInfo.cpp
SliceAnalysis.cpp
+ TopologicalSortUtils.cpp
AliasAnalysis/LocalAliasAnalysis.cpp
diff --git a/mlir/lib/Analysis/Liveness.cpp b/mlir/lib/Analysis/Liveness.cpp
index a8e0daeabf40..e3245d68b369 100644
--- a/mlir/lib/Analysis/Liveness.cpp
+++ b/mlir/lib/Analysis/Liveness.cpp
@@ -72,6 +72,10 @@ struct BlockInfoBuilder {
defValues.insert(result);
for (Value operand : op->getOperands())
useValues.insert(operand);
+ for (Region &region : op->getRegions())
+ for (Block &child : region.getBlocks())
+ for (BlockArgument arg : child.getArguments())
+ defValues.insert(arg);
});
llvm::set_subtract(useValues, defValues);
}
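The effect of the extra loop is visible when querying live-in sets: block arguments of regions nested under an op in the block now count as definitions and are no longer reported as live-in. A minimal sketch of such a query, with illustrative names:

```c++
#include "mlir/Analysis/Liveness.h"

using namespace mlir;

// Sketch: print the values live on entry to `block`, computed over the
// enclosing operation (e.g. the surrounding function-like op).
static void dumpLiveIn(Operation *parentOp, Block &block) {
  Liveness liveness(parentOp);
  for (Value value : liveness.getLiveIn(&block))
    value.dump();
}
```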
diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp
index 26fe8e3dc081..2b1cf411ceee 100644
--- a/mlir/lib/Analysis/SliceAnalysis.cpp
+++ b/mlir/lib/Analysis/SliceAnalysis.cpp
@@ -11,7 +11,8 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/SliceAnalysis.h"
-#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
+#include "mlir/IR/Block.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Support/LLVM.h"
@@ -164,62 +165,6 @@ mlir::getSlice(Operation *op, const BackwardSliceOptions &backwardSliceOptions,
return topologicalSort(slice);
}
-namespace {
-/// DFS post-order implementation that maintains a global count to work across
-/// multiple invocations, to help implement topological sort on multi-root DAGs.
-/// We traverse all operations but only record the ones that appear in
-/// `toSort` for the final result.
-struct DFSState {
- DFSState(const SetVector<Operation *> &set) : toSort(set), seen() {}
- const SetVector<Operation *> &toSort;
- SmallVector<Operation *, 16> topologicalCounts;
- DenseSet<Operation *> seen;
-};
-} // namespace
-
-static void dfsPostorder(Operation *root, DFSState *state) {
- SmallVector<Operation *> queue(1, root);
- std::vector<Operation *> ops;
- while (!queue.empty()) {
- Operation *current = queue.pop_back_val();
- ops.push_back(current);
- for (Operation *op : current->getUsers())
- queue.push_back(op);
- for (Region &region : current->getRegions()) {
- for (Operation &op : region.getOps())
- queue.push_back(&op);
- }
- }
-
- for (Operation *op : llvm::reverse(ops)) {
- if (state->seen.insert(op).second && state->toSort.count(op) > 0)
- state->topologicalCounts.push_back(op);
- }
-}
-
-SetVector<Operation *>
-mlir::topologicalSort(const SetVector<Operation *> &toSort) {
- if (toSort.empty()) {
- return toSort;
- }
-
- // Run from each root with global count and `seen` set.
- DFSState state(toSort);
- for (auto *s : toSort) {
- assert(toSort.count(s) == 1 && "NYI: multi-sets not supported");
- dfsPostorder(s, &state);
- }
-
- // Reorder and return.
- SetVector<Operation *> res;
- for (auto it = state.topologicalCounts.rbegin(),
- eit = state.topologicalCounts.rend();
- it != eit; ++it) {
- res.insert(*it);
- }
- return res;
-}
-
/// Returns true if `value` (transitively) depends on iteration-carried values
/// of the given `ancestorOp`.
static bool dependsOnCarriedVals(Value value,
diff --git a/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp b/mlir/lib/Analysis/TopologicalSortUtils.cpp
index f3a9d217f2c9..c406960fdecc 100644
--- a/mlir/lib/Transforms/Utils/TopologicalSortUtils.cpp
+++ b/mlir/lib/Analysis/TopologicalSortUtils.cpp
@@ -1,4 +1,4 @@
-//===- TopologicalSortUtils.h - Topological sort utilities ------*- C++ -*-===//
+//===- TopologicalSortUtils.cpp - Topological sort utilities --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,8 +6,13 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Transforms/TopologicalSortUtils.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
+#include "mlir/IR/Block.h"
#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/RegionGraphTraits.h"
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
using namespace mlir;
@@ -146,3 +151,135 @@ bool mlir::computeTopologicalSorting(
return allOpsScheduled;
}
+
+SetVector<Block *> mlir::getBlocksSortedByDominance(Region &region) {
+ // For each block that has not been visited yet (i.e. that has no
+ // predecessors), add it to the list as well as its successors.
+ SetVector<Block *> blocks;
+ for (Block &b : region) {
+ if (blocks.count(&b) == 0) {
+ llvm::ReversePostOrderTraversal<Block *> traversal(&b);
+ blocks.insert(traversal.begin(), traversal.end());
+ }
+ }
+ assert(blocks.size() == region.getBlocks().size() &&
+ "some blocks are not sorted");
+
+ return blocks;
+}
+
+namespace {
+class TopoSortHelper {
+public:
+ explicit TopoSortHelper(const SetVector<Operation *> &toSort)
+ : toSort(toSort) {}
+
+ /// Executes the topological sort of the operations this instance was
+ /// constructed with. This function will destroy the internal state of the
+ /// instance.
+ SetVector<Operation *> sort() {
+ if (toSort.size() <= 1) {
+ // Note: Creates a copy on purpose.
+ return toSort;
+ }
+
+ // First, find the root region to start the traversal through the IR. This
+ // additionally enriches the internal caches with all relevant ancestor
+ // regions and blocks.
+ Region *rootRegion = findCommonAncestorRegion();
+ assert(rootRegion && "expected all ops to have a common ancestor");
+
+ // Sort all elements in `toSort` by traversing the IR in the appropriate
+ // order.
+ SetVector<Operation *> result = topoSortRegion(*rootRegion);
+ assert(result.size() == toSort.size() &&
+ "expected all operations to be present in the result");
+ return result;
+ }
+
+private:
+ /// Computes the closest common ancestor region of all operations in `toSort`.
+ Region *findCommonAncestorRegion() {
+ // Map to count the number of times a region was encountered.
+ DenseMap<Region *, size_t> regionCounts;
+ size_t expectedCount = toSort.size();
+
+ // Walk the region tree for each operation towards the root and add to the
+ // region count.
+ Region *res = nullptr;
+ for (Operation *op : toSort) {
+ Region *current = op->getParentRegion();
+ // Store the block as an ancestor block.
+ ancestorBlocks.insert(op->getBlock());
+ while (current) {
+ // Insert or update the count and compare it.
+ if (++regionCounts[current] == expectedCount) {
+ res = current;
+ break;
+ }
+ ancestorBlocks.insert(current->getParentOp()->getBlock());
+ current = current->getParentRegion();
+ }
+ }
+ auto firstRange = llvm::make_first_range(regionCounts);
+ ancestorRegions.insert(firstRange.begin(), firstRange.end());
+ return res;
+ }
+
+ /// Performs the dominance-respecting IR walk to collect the topological order
+ /// of the operations to sort.
+ SetVector<Operation *> topoSortRegion(Region &rootRegion) {
+ using StackT = PointerUnion<Region *, Block *, Operation *>;
+
+ SetVector<Operation *> result;
+ // Stack that stores the different IR constructs to traverse.
+ SmallVector<StackT> stack;
+ stack.push_back(&rootRegion);
+
+ // Traverse the IR in a dominance respecting pre-order walk.
+ while (!stack.empty()) {
+ StackT current = stack.pop_back_val();
+ if (auto *region = dyn_cast<Region *>(current)) {
+ // A region's blocks need to be traversed in dominance order.
+ SetVector<Block *> sortedBlocks = getBlocksSortedByDominance(*region);
+ for (Block *block : llvm::reverse(sortedBlocks)) {
+ // Only add blocks to the stack that are ancestors of the operations
+ // to sort.
+ if (ancestorBlocks.contains(block))
+ stack.push_back(block);
+ }
+ continue;
+ }
+
+ if (auto *block = dyn_cast<Block *>(current)) {
+ // Add all of the block's operations to the stack.
+ for (Operation &op : llvm::reverse(*block))
+ stack.push_back(&op);
+ continue;
+ }
+
+ auto *op = cast<Operation *>(current);
+ if (toSort.contains(op))
+ result.insert(op);
+
+ // Add all the subregions that are ancestors of the operations to sort.
+ for (Region &subRegion : op->getRegions())
+ if (ancestorRegions.contains(&subRegion))
+ stack.push_back(&subRegion);
+ }
+ return result;
+ }
+
+ /// Operations to sort.
+ const SetVector<Operation *> &toSort;
+ /// Set containing all the ancestor regions of the operations to sort.
+ DenseSet<Region *> ancestorRegions;
+ /// Set containing all the ancestor blocks of the operations to sort.
+ DenseSet<Block *> ancestorBlocks;
+};
+} // namespace
+
+SetVector<Operation *>
+mlir::topologicalSort(const SetVector<Operation *> &toSort) {
+ return TopoSortHelper(toSort).sort();
+}
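Moving the utility into Analysis leaves the public entry point unchanged; only the header differs. A short sketch of sorting an arbitrary set of operations, for example a backward slice:

```c++
#include "mlir/Analysis/TopologicalSortUtils.h"
#include "llvm/ADT/SetVector.h"

using namespace mlir;

// Sketch: order a set of ops so every op appears after all of its producers,
// respecting dominance even when the ops span nested regions and blocks.
static SetVector<Operation *> sortOps(const SetVector<Operation *> &ops) {
  return topologicalSort(ops);
}
```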
diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp
index dda2003ba037..b5f31aa5dec5 100644
--- a/mlir/lib/Bindings/Python/IRAttributes.cpp
+++ b/mlir/lib/Bindings/Python/IRAttributes.cpp
@@ -15,6 +15,7 @@
#include "PybindUtils.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Support/raw_ostream.h"
#include "mlir-c/BuiltinAttributes.h"
#include "mlir-c/BuiltinTypes.h"
@@ -72,6 +73,27 @@ Raises:
type or if the buffer does not meet expectations.
)";
+static const char kDenseElementsAttrGetFromListDocstring[] =
+ R"(Gets a DenseElementsAttr from a Python list of attributes.
+
+Note that it can be expensive to construct attributes individually.
+For a large number of elements, consider using a Python buffer or array instead.
+
+Args:
+ attrs: A list of attributes.
+ type: The desired shape and type of the resulting DenseElementsAttr.
+ If not provided, the element type is determined based on the type
+ of the 0th attribute and the shape is `[len(attrs)]`.
+ context: Explicit context, if not from context manager.
+
+Returns:
+ DenseElementsAttr on success.
+
+Raises:
+ ValueError: If the type of the attributes does not match the type
+ specified by `type`.
+)";
+
static const char kDenseResourceElementsAttrGetFromBufferDocstring[] =
R"(Gets a DenseResourceElementsAttr from a Python buffer or array.
@@ -648,6 +670,57 @@ public:
using PyConcreteAttribute::PyConcreteAttribute;
static PyDenseElementsAttribute
+ getFromList(py::list attributes, std::optional<PyType> explicitType,
+ DefaultingPyMlirContext contextWrapper) {
+
+ const size_t numAttributes = py::len(attributes);
+ if (numAttributes == 0)
+ throw py::value_error("Attributes list must be non-empty.");
+
+ MlirType shapedType;
+ if (explicitType) {
+ if ((!mlirTypeIsAShaped(*explicitType) ||
+ !mlirShapedTypeHasStaticShape(*explicitType))) {
+
+ std::string message;
+ llvm::raw_string_ostream os(message);
+ os << "Expected a static ShapedType for the shaped_type parameter: "
+ << py::repr(py::cast(*explicitType));
+ throw py::value_error(os.str());
+ }
+ shapedType = *explicitType;
+ } else {
+ SmallVector<int64_t> shape{static_cast<int64_t>(numAttributes)};
+ shapedType = mlirRankedTensorTypeGet(
+ shape.size(), shape.data(),
+ mlirAttributeGetType(pyTryCast<PyAttribute>(attributes[0])),
+ mlirAttributeGetNull());
+ }
+
+ SmallVector<MlirAttribute> mlirAttributes;
+ mlirAttributes.reserve(numAttributes);
+ for (const py::handle &attribute : attributes) {
+ MlirAttribute mlirAttribute = pyTryCast<PyAttribute>(attribute);
+ MlirType attrType = mlirAttributeGetType(mlirAttribute);
+ mlirAttributes.push_back(mlirAttribute);
+
+ if (!mlirTypeEqual(mlirShapedTypeGetElementType(shapedType), attrType)) {
+ std::string message;
+ llvm::raw_string_ostream os(message);
+ os << "All attributes must be of the same type and match "
+ << "the type parameter: expected=" << py::repr(py::cast(shapedType))
+ << ", but got=" << py::repr(py::cast(attrType));
+ throw py::value_error(os.str());
+ }
+ }
+
+ MlirAttribute elements = mlirDenseElementsAttrGet(
+ shapedType, mlirAttributes.size(), mlirAttributes.data());
+
+ return PyDenseElementsAttribute(contextWrapper->getRef(), elements);
+ }
+
+ static PyDenseElementsAttribute
getFromBuffer(py::buffer array, bool signless,
std::optional<PyType> explicitType,
std::optional<std::vector<int64_t>> explicitShape,
@@ -883,6 +956,10 @@ public:
py::arg("type") = py::none(), py::arg("shape") = py::none(),
py::arg("context") = py::none(),
kDenseElementsAttrGetDocstring)
+ .def_static("get", PyDenseElementsAttribute::getFromList,
+ py::arg("attrs"), py::arg("type") = py::none(),
+ py::arg("context") = py::none(),
+ kDenseElementsAttrGetFromListDocstring)
.def_static("get_splat", PyDenseElementsAttribute::getSplat,
py::arg("shaped_type"), py::arg("element_attr"),
"Gets a DenseElementsAttr where all values are the same")
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 01678a9719f9..2b2792ea6c77 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -240,7 +240,20 @@ struct PyGlobalDebugFlag {
// Debug flags.
py::class_<PyGlobalDebugFlag>(m, "_GlobalDebug", py::module_local())
.def_property_static("flag", &PyGlobalDebugFlag::get,
- &PyGlobalDebugFlag::set, "LLVM-wide debug flag");
+ &PyGlobalDebugFlag::set, "LLVM-wide debug flag")
+ .def_static(
+ "set_types",
+ [](const std::string &type) {
+ mlirSetGlobalDebugType(type.c_str());
+ },
+ "types"_a, "Sets specific debug types to be produced by LLVM")
+ .def_static("set_types", [](const std::vector<std::string> &types) {
+ std::vector<const char *> pointers;
+ pointers.reserve(types.size());
+ for (const std::string &str : types)
+ pointers.push_back(str.c_str());
+ mlirSetGlobalDebugTypes(pointers.data(), pointers.size());
+ });
}
};
diff --git a/mlir/lib/CAPI/Debug/Debug.cpp b/mlir/lib/CAPI/Debug/Debug.cpp
index 288ecd601274..320ece4998e0 100644
--- a/mlir/lib/CAPI/Debug/Debug.cpp
+++ b/mlir/lib/CAPI/Debug/Debug.cpp
@@ -16,3 +16,21 @@
void mlirEnableGlobalDebug(bool enable) { llvm::DebugFlag = enable; }
bool mlirIsGlobalDebugEnabled() { return llvm::DebugFlag; }
+
+void mlirSetGlobalDebugType(const char *type) {
+ // Depending on the NDEBUG flag, this name can be either a function or a macro
+ // that expands to something that isn't a function call, so we cannot
+ // explicitly prefix it with `llvm::` or import it with a `using` declaration.
+ using namespace llvm;
+ setCurrentDebugType(type);
+}
+
+void mlirSetGlobalDebugTypes(const char **types, intptr_t n) {
+ using namespace llvm;
+ setCurrentDebugTypes(types, n);
+}
+
+bool mlirIsCurrentDebugType(const char *type) {
+ using namespace llvm;
+ return isCurrentDebugType(type);
+}
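A short sketch of combining these entry points from client code (the debug type strings are illustrative):

```c++
#include "mlir-c/Debug.h"

// Sketch: turn on LLVM's global debug output, restricted to chosen types.
static void enableFilteredDebugOutput() {
  mlirEnableGlobalDebug(true);
  // Limit -debug style output to a single DEBUG_TYPE.
  mlirSetGlobalDebugType("dialect-conversion");

  // Or enable several debug types at once.
  const char *types[] = {"dialect-conversion", "greedy-rewriter"};
  mlirSetGlobalDebugTypes(types, 2);

  // Query whether a given type is currently active.
  (void)mlirIsCurrentDebugType("dialect-conversion");
}
```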
diff --git a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
index 1447b182ccfd..0be3d76f556d 100644
--- a/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
+++ b/mlir/lib/Conversion/ArithToEmitC/ArithToEmitC.cpp
@@ -15,6 +15,7 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/EmitC/IR/EmitC.h"
+#include "mlir/Tools/PDLL/AST/Types.h"
#include "mlir/Transforms/DialectConversion.h"
using namespace mlir;
@@ -112,6 +113,93 @@ public:
}
};
+template <typename ArithOp, bool castToUnsigned>
+class CastConversion : public OpConversionPattern<ArithOp> {
+public:
+ using OpConversionPattern<ArithOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(ArithOp op, typename ArithOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+
+ Type opReturnType = this->getTypeConverter()->convertType(op.getType());
+ if (!isa_and_nonnull<IntegerType>(opReturnType))
+ return rewriter.notifyMatchFailure(op, "expected integer result type");
+
+ if (adaptor.getOperands().size() != 1) {
+ return rewriter.notifyMatchFailure(
+ op, "CastConversion only supports unary ops");
+ }
+
+ Type operandType = adaptor.getIn().getType();
+ if (!isa_and_nonnull<IntegerType>(operandType))
+ return rewriter.notifyMatchFailure(op, "expected integer operand type");
+
+ // Signed (sign-extending) casts from i1 are not supported.
+ if (operandType.isInteger(1) && !castToUnsigned)
+ return rewriter.notifyMatchFailure(op,
+ "operation not supported on i1 type");
+
+ // to-i1 conversions: arith semantics want truncation, whereas (bool)(v) is
+ // equivalent to (v != 0). Implementing as (bool)(v & 0x01) gives
+ // truncation.
+ if (opReturnType.isInteger(1)) {
+ auto constOne = rewriter.create<emitc::ConstantOp>(
+ op.getLoc(), operandType, rewriter.getIntegerAttr(operandType, 1));
+ auto oneAndOperand = rewriter.create<emitc::BitwiseAndOp>(
+ op.getLoc(), operandType, adaptor.getIn(), constOne);
+ rewriter.replaceOpWithNewOp<emitc::CastOp>(op, opReturnType,
+ oneAndOperand);
+ return success();
+ }
+
+ bool isTruncation = operandType.getIntOrFloatBitWidth() >
+ opReturnType.getIntOrFloatBitWidth();
+ bool doUnsigned = castToUnsigned || isTruncation;
+
+ Type castType = opReturnType;
+ // If the op is a ui variant and the type wanted as
+ // return type isn't unsigned, we need to issue an unsigned type to do
+ // the conversion.
+ if (castType.isUnsignedInteger() != doUnsigned) {
+ castType = rewriter.getIntegerType(opReturnType.getIntOrFloatBitWidth(),
+ /*isSigned=*/!doUnsigned);
+ }
+
+ Value actualOp = adaptor.getIn();
+ // Adapt the signedness of the operand if necessary
+ if (operandType.isUnsignedInteger() != doUnsigned) {
+ Type correctSignednessType =
+ rewriter.getIntegerType(operandType.getIntOrFloatBitWidth(),
+ /*isSigned=*/!doUnsigned);
+ actualOp = rewriter.template create<emitc::CastOp>(
+ op.getLoc(), correctSignednessType, actualOp);
+ }
+
+ auto result = rewriter.template create<emitc::CastOp>(op.getLoc(), castType,
+ actualOp);
+
+ // Cast to the expected output type
+ if (castType != opReturnType) {
+ result = rewriter.template create<emitc::CastOp>(op.getLoc(),
+ opReturnType, result);
+ }
+
+ rewriter.replaceOp(op, result);
+ return success();
+ }
+};
+
+template <typename ArithOp>
+class UnsignedCastConversion : public CastConversion<ArithOp, true> {
+ using CastConversion<ArithOp, true>::CastConversion;
+};
+
+template <typename ArithOp>
+class SignedCastConversion : public CastConversion<ArithOp, false> {
+ using CastConversion<ArithOp, false>::CastConversion;
+};
+
template <typename ArithOp, typename EmitCOp>
class ArithOpConversion final : public OpConversionPattern<ArithOp> {
public:
@@ -313,6 +401,10 @@ void mlir::populateArithToEmitCPatterns(TypeConverter &typeConverter,
IntegerOpConversion<arith::SubIOp, emitc::SubOp>,
CmpIOpConversion,
SelectOpConversion,
+ // Truncation is guaranteed for unsigned types.
+ UnsignedCastConversion<arith::TruncIOp>,
+ SignedCastConversion<arith::ExtSIOp>,
+ UnsignedCastConversion<arith::ExtUIOp>,
ItoFCastOpConversion<arith::SIToFPOp>,
ItoFCastOpConversion<arith::UIToFPOp>,
FtoICastOpConversion<arith::FPToSIOp>,
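The i1 handling in `CastConversion` above is the non-obvious part of these patterns: a plain C-style cast of an integer to `bool` tests for non-zero, while `arith.trunci` keeps only the low bit. A tiny sketch of the semantics the emitted `bitwise_and` plus `cast` sequence models (illustrative C++, not generated code):

```c++
#include <cstdint>

// Truncation to i1 as the lowering models it: mask with 0x01 first, so that
// for example 2 truncates to false rather than converting to true.
static bool truncToI1(int32_t v) { return static_cast<bool>(v & 0x01); }

// What a plain (bool)(v) would compute instead: a non-zero test.
static bool nonZeroToBool(int32_t v) { return static_cast<bool>(v); }
```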
diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
index 53b44aa3241b..94b7c8d4f2fd 100644
--- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
+++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp
@@ -449,61 +449,47 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp,
"region types conversion failed");
}
+ if (!shouldUseBarePtrCallConv(funcOp, &converter)) {
+ if (funcOp->getAttrOfType<UnitAttr>(
+ LLVM::LLVMDialect::getEmitCWrapperAttrName())) {
+ if (newFuncOp.isVarArg())
+ return funcOp.emitError("C interface for variadic functions is not "
+ "supported yet.");
+
+ if (newFuncOp.isExternal())
+ wrapExternalFunction(rewriter, funcOp->getLoc(), converter, funcOp,
+ newFuncOp);
+ else
+ wrapForExternalCallers(rewriter, funcOp->getLoc(), converter, funcOp,
+ newFuncOp);
+ }
+ } else {
+ modifyFuncOpToUseBarePtrCallingConv(
+ rewriter, funcOp->getLoc(), converter, newFuncOp,
+ llvm::cast<FunctionType>(funcOp.getFunctionType()).getInputs());
+ }
+
return newFuncOp;
}
namespace {
-struct FuncOpConversionBase : public ConvertOpToLLVMPattern<func::FuncOp> {
-protected:
- using ConvertOpToLLVMPattern<func::FuncOp>::ConvertOpToLLVMPattern;
-
- // Convert input FuncOp to LLVMFuncOp by using the LLVMTypeConverter provided
- // to this legalization pattern.
- FailureOr<LLVM::LLVMFuncOp>
- convertFuncOpToLLVMFuncOp(func::FuncOp funcOp,
- ConversionPatternRewriter &rewriter) const {
- return mlir::convertFuncOpToLLVMFuncOp(
- cast<FunctionOpInterface>(funcOp.getOperation()), rewriter,
- *getTypeConverter());
- }
-};
-
/// FuncOp legalization pattern that converts MemRef arguments to pointers to
/// MemRef descriptors (LLVM struct data types) containing all the MemRef type
/// information.
-struct FuncOpConversion : public FuncOpConversionBase {
+struct FuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
FuncOpConversion(const LLVMTypeConverter &converter)
- : FuncOpConversionBase(converter) {}
+ : ConvertOpToLLVMPattern(converter) {}
LogicalResult
matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- FailureOr<LLVM::LLVMFuncOp> newFuncOp =
- convertFuncOpToLLVMFuncOp(funcOp, rewriter);
+ FailureOr<LLVM::LLVMFuncOp> newFuncOp = mlir::convertFuncOpToLLVMFuncOp(
+ cast<FunctionOpInterface>(funcOp.getOperation()), rewriter,
+ *getTypeConverter());
if (failed(newFuncOp))
return rewriter.notifyMatchFailure(funcOp, "Could not convert funcop");
- if (!shouldUseBarePtrCallConv(funcOp, this->getTypeConverter())) {
- if (funcOp->getAttrOfType<UnitAttr>(
- LLVM::LLVMDialect::getEmitCWrapperAttrName())) {
- if (newFuncOp->isVarArg())
- return funcOp->emitError("C interface for variadic functions is not "
- "supported yet.");
-
- if (newFuncOp->isExternal())
- wrapExternalFunction(rewriter, funcOp->getLoc(), *getTypeConverter(),
- funcOp, *newFuncOp);
- else
- wrapForExternalCallers(rewriter, funcOp->getLoc(),
- *getTypeConverter(), funcOp, *newFuncOp);
- }
- } else {
- modifyFuncOpToUseBarePtrCallingConv(rewriter, funcOp->getLoc(),
- *getTypeConverter(), *newFuncOp,
- funcOp.getFunctionType().getInputs());
- }
-
rewriter.eraseOp(funcOp);
return success();
}
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index a206c7b228d2..f6a6d1d7228a 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -185,21 +185,6 @@ struct MapInfoOpConversion : public ConvertOpToLLVMPattern<omp::MapInfoOp> {
}
};
-struct ReductionOpConversion : public ConvertOpToLLVMPattern<omp::ReductionOp> {
- using ConvertOpToLLVMPattern<omp::ReductionOp>::ConvertOpToLLVMPattern;
- LogicalResult
- matchAndRewrite(omp::ReductionOp curOp, OpAdaptor adaptor,
- ConversionPatternRewriter &rewriter) const override {
- if (isa<MemRefType>(curOp.getAccumulator().getType())) {
- // TODO: Support memref type in variable operands
- return rewriter.notifyMatchFailure(curOp, "memref is not supported yet");
- }
- rewriter.replaceOpWithNewOp<omp::ReductionOp>(
- curOp, TypeRange(), adaptor.getOperands(), curOp->getAttrs());
- return success();
- }
-};
-
template <typename OpType>
struct MultiRegionOpConversion : public ConvertOpToLLVMPattern<OpType> {
using ConvertOpToLLVMPattern<OpType>::ConvertOpToLLVMPattern;
@@ -246,9 +231,6 @@ void mlir::configureOpenMPToLLVMConversionLegality(
return typeConverter.isLegal(op->getOperandTypes()) &&
typeConverter.isLegal(op->getResultTypes());
});
- target.addDynamicallyLegalOp<mlir::omp::ReductionOp>([&](Operation *op) {
- return typeConverter.isLegal(op->getOperandTypes());
- });
target.addDynamicallyLegalOp<
mlir::omp::AtomicUpdateOp, mlir::omp::CriticalOp, mlir::omp::TargetOp,
mlir::omp::TargetDataOp, mlir::omp::LoopNestOp,
@@ -275,11 +257,11 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter,
[&](omp::MapBoundsType type) -> Type { return type; });
patterns.add<
- AtomicReadOpConversion, MapInfoOpConversion, ReductionOpConversion,
+ AtomicReadOpConversion, MapInfoOpConversion,
MultiRegionOpConversion<omp::DeclareReductionOp>,
MultiRegionOpConversion<omp::PrivateClauseOp>,
RegionOpConversion<omp::CriticalOp>, RegionOpConversion<omp::LoopNestOp>,
- RegionOpConversion<omp::MasterOp>, ReductionOpConversion,
+ RegionOpConversion<omp::MasterOp>,
RegionOpConversion<omp::OrderedRegionOp>,
RegionOpConversion<omp::ParallelOp>, RegionOpConversion<omp::WsloopOp>,
RegionOpConversion<omp::SectionsOp>, RegionOpConversion<omp::SectionOp>,
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index 332f0a2eecfc..4496c2bc5fe8 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -15,6 +15,7 @@
#include <type_traits>
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
index 84ae4b52dcf4..7f3e43d0b4cd 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
@@ -12,6 +12,7 @@
#include "mlir/Dialect/Affine/LoopFusionUtils.h"
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
index 71eb36bb07a6..fbe2ecab8adc 100644
--- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp
@@ -19,6 +19,16 @@ using namespace mlir;
using namespace mlir::arith;
using namespace mlir::intrange;
+static intrange::OverflowFlags
+convertArithOverflowFlags(arith::IntegerOverflowFlags flags) {
+ intrange::OverflowFlags retFlags = intrange::OverflowFlags::None;
+ if (bitEnumContainsAny(flags, arith::IntegerOverflowFlags::nsw))
+ retFlags |= intrange::OverflowFlags::Nsw;
+ if (bitEnumContainsAny(flags, arith::IntegerOverflowFlags::nuw))
+ retFlags |= intrange::OverflowFlags::Nuw;
+ return retFlags;
+}
+
//===----------------------------------------------------------------------===//
// ConstantOp
//===----------------------------------------------------------------------===//
@@ -38,7 +48,8 @@ void arith::ConstantOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void arith::AddIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferAdd(argRanges));
+ setResultRange(getResult(), inferAdd(argRanges, convertArithOverflowFlags(
+ getOverflowFlags())));
}
//===----------------------------------------------------------------------===//
@@ -47,7 +58,8 @@ void arith::AddIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void arith::SubIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferSub(argRanges));
+ setResultRange(getResult(), inferSub(argRanges, convertArithOverflowFlags(
+ getOverflowFlags())));
}
//===----------------------------------------------------------------------===//
@@ -56,7 +68,8 @@ void arith::SubIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void arith::MulIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferMul(argRanges));
+ setResultRange(getResult(), inferMul(argRanges, convertArithOverflowFlags(
+ getOverflowFlags())));
}
//===----------------------------------------------------------------------===//
@@ -302,7 +315,8 @@ void arith::SelectOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void arith::ShLIOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferShl(argRanges));
+ setResultRange(getResult(), inferShl(argRanges, convertArithOverflowFlags(
+ getOverflowFlags())));
}
//===----------------------------------------------------------------------===//
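[Editor's note: the helper added above translates arith's overflow flags into the range-inference flag enum before calling inferAdd/inferSub/inferMul/inferShl. Below is a minimal standalone sketch of that bit-flag translation pattern in plain C++; the enums and names here are hypothetical stand-ins, not the MLIR types.]

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for arith::IntegerOverflowFlags and
// intrange::OverflowFlags; the real ones are MLIR bit-enums.
enum class SrcFlags : uint32_t { None = 0, Nsw = 1, Nuw = 2 };
enum class DstFlags : uint32_t { None = 0, Nsw = 1, Nuw = 2 };

static bool containsAny(SrcFlags value, SrcFlags mask) {
  return (static_cast<uint32_t>(value) & static_cast<uint32_t>(mask)) != 0;
}

// Same shape as convertArithOverflowFlags: probe each source bit and set the
// matching destination bit.
static DstFlags convertFlags(SrcFlags flags) {
  uint32_t ret = static_cast<uint32_t>(DstFlags::None);
  if (containsAny(flags, SrcFlags::Nsw))
    ret |= static_cast<uint32_t>(DstFlags::Nsw);
  if (containsAny(flags, SrcFlags::Nuw))
    ret |= static_cast<uint32_t>(DstFlags::Nuw);
  return static_cast<DstFlags>(ret);
}

int main() {
  SrcFlags f = static_cast<SrcFlags>(3); // Nsw | Nuw
  std::printf("converted = %u\n", static_cast<unsigned>(convertFlags(f))); // 3
  return 0;
}

[With both bits set this prints 3, illustrating how nsw/nuw now flow from AddIOp/SubIOp/MulIOp/ShLIOp into the range library.]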
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
index acbbbe9932e1..733e758b4390 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
@@ -46,6 +46,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Analysis/Liveness.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/ArmSME/IR/ArmSME.h"
#include "mlir/Dialect/ArmSME/Transforms/Passes.h"
#include "mlir/Dialect/ArmSME/Transforms/Transforms.h"
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index db1974ddb377..f4573030a457 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Config/mlir-config.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
@@ -39,7 +38,7 @@
using namespace mlir;
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
namespace {
//===----------------------------------------------------------------------===//
@@ -128,4 +127,4 @@ void mlir::gpu::registerGPUToNVVMPipeline() {
buildLowerToNVVMPassPipeline);
}
-#endif // MLIR_ENABLE_CUDA_CONVERSIONS
+#endif // LLVM_HAS_NVPTX_TARGET
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index 836e939a8295..1e7596e8cc4a 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -13,7 +13,6 @@
#include "mlir/Dialect/GPU/Transforms/Passes.h"
-#include "mlir/Config/mlir-config.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -49,7 +48,7 @@ void GpuModuleToBinaryPass::getDependentDialects(
// Register all GPU related translations.
registry.insert<gpu::GPUDialect>();
registry.insert<LLVM::LLVMDialect>();
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
registry.insert<NVVM::NVVMDialect>();
#endif
#if MLIR_ENABLE_ROCM_CONVERSIONS
diff --git a/mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp
index b6b8a136791c..64adb6b85052 100644
--- a/mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/Index/IR/InferIntRangeInterfaceImpls.cpp
@@ -44,19 +44,32 @@ void BoolConstantOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
// we take the 64-bit result).
//===----------------------------------------------------------------------===//
+// Some arithmetic inference functions allow specifying special overflow / wrap
+// behavior. We do not require this for the IndexOps and use this helper to call
+// the inference function without any `OverflowFlags`.
+static std::function<ConstantIntRanges(ArrayRef<ConstantIntRanges>)>
+inferWithoutOverflowFlags(InferRangeWithOvfFlagsFn inferWithOvfFn) {
+ return [inferWithOvfFn](ArrayRef<ConstantIntRanges> argRanges) {
+ return inferWithOvfFn(argRanges, OverflowFlags::None);
+ };
+}
+
void AddOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferIndexOp(inferAdd, argRanges, CmpMode::Both));
+ setResultRange(getResult(), inferIndexOp(inferWithoutOverflowFlags(inferAdd),
+ argRanges, CmpMode::Both));
}
void SubOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferIndexOp(inferSub, argRanges, CmpMode::Both));
+ setResultRange(getResult(), inferIndexOp(inferWithoutOverflowFlags(inferSub),
+ argRanges, CmpMode::Both));
}
void MulOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferIndexOp(inferMul, argRanges, CmpMode::Both));
+ setResultRange(getResult(), inferIndexOp(inferWithoutOverflowFlags(inferMul),
+ argRanges, CmpMode::Both));
}
void DivUOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
@@ -127,7 +140,8 @@ void MinUOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
void ShlOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
- setResultRange(getResult(), inferIndexOp(inferShl, argRanges, CmpMode::Both));
+ setResultRange(getResult(), inferIndexOp(inferWithoutOverflowFlags(inferShl),
+ argRanges, CmpMode::Both));
}
void ShrSOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
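[Editor's note: inferWithoutOverflowFlags above is a closure adapter: it fixes the OverflowFlags argument to None so the flag-aware arith inference functions keep working for index ops, which carry no overflow flags. A standalone sketch of that adapter shape follows, using placeholder types rather than the MLIR signatures.]

#include <functional>
#include <iostream>
#include <utility>
#include <vector>

// Placeholder stand-ins for ConstantIntRanges and intrange::OverflowFlags.
using Range = std::pair<int, int>;
enum class Ovf { None, Nsw, Nuw };

using InferWithFlagsFn = std::function<Range(const std::vector<Range> &, Ovf)>;
using InferFn = std::function<Range(const std::vector<Range> &)>;

// Wrap a flag-aware inference function into one that always passes Ovf::None.
static InferFn withoutFlags(InferWithFlagsFn inner) {
  return [inner](const std::vector<Range> &args) {
    return inner(args, Ovf::None);
  };
}

int main() {
  InferWithFlagsFn add = [](const std::vector<Range> &args, Ovf) {
    return Range{args[0].first + args[1].first,
                 args[0].second + args[1].second};
  };
  Range r = withoutFlags(add)({{0, 3}, {1, 2}});
  std::cout << "[" << r.first << ", " << r.second << "]\n"; // [1, 5]
  return 0;
}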
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
index c80494a44011..728885fcbeaf 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/CMakeLists.txt
@@ -6,7 +6,6 @@ add_mlir_dialect_library(MLIRLLVMIRTransforms
LegalizeForExport.cpp
OptimizeForNVVM.cpp
RequestCWrappers.cpp
- TypeConsistency.cpp
DEPENDS
MLIRLLVMPassIncGen
diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp
deleted file mode 100644
index 0a372ad0c52f..000000000000
--- a/mlir/lib/Dialect/LLVMIR/Transforms/TypeConsistency.cpp
+++ /dev/null
@@ -1,575 +0,0 @@
-//===- TypeConsistency.cpp - Rewrites to improve type consistency ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/LLVMIR/Transforms/TypeConsistency.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "llvm/ADT/TypeSwitch.h"
-
-namespace mlir {
-namespace LLVM {
-#define GEN_PASS_DEF_LLVMTYPECONSISTENCY
-#include "mlir/Dialect/LLVMIR/Transforms/Passes.h.inc"
-} // namespace LLVM
-} // namespace mlir
-
-using namespace mlir;
-using namespace LLVM;
-
-//===----------------------------------------------------------------------===//
-// Utils
-//===----------------------------------------------------------------------===//
-
-/// Checks that a pointer value has a pointee type hint consistent with the
-/// expected type. Returns the type it actually hints to if it differs, or
-/// nullptr if the type is consistent or impossible to analyze.
-static Type isElementTypeInconsistent(Value addr, Type expectedType) {
- auto defOp = dyn_cast_or_null<GetResultPtrElementType>(addr.getDefiningOp());
- if (!defOp)
- return nullptr;
-
- Type elemType = defOp.getResultPtrElementType();
- if (!elemType)
- return nullptr;
-
- if (elemType == expectedType)
- return nullptr;
-
- return elemType;
-}
-
-//===----------------------------------------------------------------------===//
-// CanonicalizeAlignedGep
-//===----------------------------------------------------------------------===//
-
-/// Returns the amount of bytes the provided GEP elements will offset the
-/// pointer by. Returns nullopt if the offset could not be computed.
-static std::optional<uint64_t> gepToByteOffset(DataLayout &layout, GEPOp gep) {
-
- SmallVector<uint32_t> indices;
- // Ensures all indices are static and fetches them.
- for (auto index : gep.getIndices()) {
- IntegerAttr indexInt = llvm::dyn_cast_if_present<IntegerAttr>(index);
- if (!indexInt)
- return std::nullopt;
- int32_t gepIndex = indexInt.getInt();
- if (gepIndex < 0)
- return std::nullopt;
- indices.push_back(static_cast<uint32_t>(gepIndex));
- }
-
- uint64_t offset = indices[0] * layout.getTypeSize(gep.getElemType());
-
- Type currentType = gep.getElemType();
- for (uint32_t index : llvm::drop_begin(indices)) {
- bool shouldCancel =
- TypeSwitch<Type, bool>(currentType)
- .Case([&](LLVMArrayType arrayType) {
- if (arrayType.getNumElements() <= index)
- return true;
- offset += index * layout.getTypeSize(arrayType.getElementType());
- currentType = arrayType.getElementType();
- return false;
- })
- .Case([&](LLVMStructType structType) {
- ArrayRef<Type> body = structType.getBody();
- if (body.size() <= index)
- return true;
- for (uint32_t i = 0; i < index; i++) {
- if (!structType.isPacked())
- offset = llvm::alignTo(offset,
- layout.getTypeABIAlignment(body[i]));
- offset += layout.getTypeSize(body[i]);
- }
- currentType = body[index];
- return false;
- })
- .Default([](Type) { return true; });
-
- if (shouldCancel)
- return std::nullopt;
- }
-
- return offset;
-}
-
-/// Fills in `equivalentIndicesOut` with GEP indices that would be equivalent to
-/// offsetting a pointer by `offset` bytes, assuming the GEP has `base` as base
-/// type.
-static LogicalResult
-findIndicesForOffset(DataLayout &layout, Type base, uint64_t offset,
- SmallVectorImpl<GEPArg> &equivalentIndicesOut) {
-
- uint64_t baseSize = layout.getTypeSize(base);
- uint64_t rootIndex = offset / baseSize;
- if (rootIndex > std::numeric_limits<uint32_t>::max())
- return failure();
- equivalentIndicesOut.push_back(rootIndex);
-
- uint64_t distanceToStart = rootIndex * baseSize;
-
-#ifndef NDEBUG
- auto isWithinCurrentType = [&](Type currentType) {
- return offset < distanceToStart + layout.getTypeSize(currentType);
- };
-#endif
-
- Type currentType = base;
- while (distanceToStart < offset) {
- // While an index that does not perfectly align with offset has not been
- // reached...
-
- assert(isWithinCurrentType(currentType));
-
- bool shouldCancel =
- TypeSwitch<Type, bool>(currentType)
- .Case([&](LLVMArrayType arrayType) {
- // Find which element of the array contains the offset.
- uint64_t elemSize =
- layout.getTypeSize(arrayType.getElementType());
- uint64_t index = (offset - distanceToStart) / elemSize;
- equivalentIndicesOut.push_back(index);
- distanceToStart += index * elemSize;
-
- // Then, try to find where in the element the offset is. If the
- // offset is exactly the beginning of the element, the loop is
- // complete.
- currentType = arrayType.getElementType();
-
- // Only continue if the element in question can be indexed using
- // an i32.
- return index > std::numeric_limits<uint32_t>::max();
- })
- .Case([&](LLVMStructType structType) {
- ArrayRef<Type> body = structType.getBody();
- uint32_t index = 0;
-
- // Walk over the elements of the struct to find in which of them
- // the offset is.
- for (Type elem : body) {
- uint64_t elemSize = layout.getTypeSize(elem);
- if (!structType.isPacked()) {
- distanceToStart = llvm::alignTo(
- distanceToStart, layout.getTypeABIAlignment(elem));
- // If the offset is in padding, cancel the rewrite.
- if (offset < distanceToStart)
- return true;
- }
-
- if (offset < distanceToStart + elemSize) {
- // The offset is within this element, stop iterating the
- // struct and look within the current element.
- equivalentIndicesOut.push_back(index);
- currentType = elem;
- return false;
- }
-
- // The offset is not within this element, continue walking over
- // the struct.
- distanceToStart += elemSize;
- index++;
- }
-
- // The offset was supposed to be within this struct but is not.
- // This can happen if the offset points into final padding.
- // Anyway, nothing can be done.
- return true;
- })
- .Default([](Type) {
- // If the offset is within a type that cannot be split, no indices
- // will yield this offset. This can happen if the offset is not
- // perfectly aligned with a leaf type.
- // TODO: support vectors.
- return true;
- });
-
- if (shouldCancel)
- return failure();
- }
-
- return success();
-}
-
-/// Returns the consistent type for the GEP if the GEP is not type-consistent.
-/// Returns failure if the GEP is already consistent.
-static FailureOr<Type> getRequiredConsistentGEPType(GEPOp gep) {
- // GEP of typed pointers are not supported.
- if (!gep.getElemType())
- return failure();
-
- std::optional<Type> maybeBaseType = gep.getElemType();
- if (!maybeBaseType)
- return failure();
- Type baseType = *maybeBaseType;
-
- Type typeHint = isElementTypeInconsistent(gep.getBase(), baseType);
- if (!typeHint)
- return failure();
- return typeHint;
-}
-
-LogicalResult
-CanonicalizeAlignedGep::matchAndRewrite(GEPOp gep,
- PatternRewriter &rewriter) const {
- FailureOr<Type> typeHint = getRequiredConsistentGEPType(gep);
- if (failed(typeHint)) {
- // GEP is already canonical, nothing to do here.
- return failure();
- }
-
- DataLayout layout = DataLayout::closest(gep);
- std::optional<uint64_t> desiredOffset = gepToByteOffset(layout, gep);
- if (!desiredOffset)
- return failure();
-
- SmallVector<GEPArg> newIndices;
- if (failed(
- findIndicesForOffset(layout, *typeHint, *desiredOffset, newIndices)))
- return failure();
-
- rewriter.replaceOpWithNewOp<GEPOp>(
- gep, LLVM::LLVMPointerType::get(getContext()), *typeHint, gep.getBase(),
- newIndices, gep.getInbounds());
-
- return success();
-}
-
-namespace {
-/// Class abstracting over both array and struct types, turning each into ranges
-/// of their sub-types.
-class DestructurableTypeRange
- : public llvm::indexed_accessor_range<DestructurableTypeRange,
- DestructurableTypeInterface, Type,
- Type *, Type> {
-
- using Base = llvm::indexed_accessor_range<
- DestructurableTypeRange, DestructurableTypeInterface, Type, Type *, Type>;
-
-public:
- using Base::Base;
-
- /// Constructs a DestructurableTypeRange from either a LLVMStructType or
- /// LLVMArrayType.
- explicit DestructurableTypeRange(DestructurableTypeInterface base)
- : Base(base, 0, [&]() -> ptrdiff_t {
- return TypeSwitch<DestructurableTypeInterface, ptrdiff_t>(base)
- .Case([](LLVMStructType structType) {
- return structType.getBody().size();
- })
- .Case([](LLVMArrayType arrayType) {
- return arrayType.getNumElements();
- })
- .Default([](auto) -> ptrdiff_t {
- llvm_unreachable(
- "Only LLVMStructType or LLVMArrayType supported");
- });
- }()) {}
-
- /// Returns true if this is a range over a packed struct.
- bool isPacked() const {
- if (auto structType = dyn_cast<LLVMStructType>(getBase()))
- return structType.isPacked();
- return false;
- }
-
-private:
- static Type dereference(DestructurableTypeInterface base, ptrdiff_t index) {
- // i32 chosen because the implementations of ArrayType and StructType
- // specifically expect it to be 32 bit. They will fail otherwise.
- Type result = base.getTypeAtIndex(
- IntegerAttr::get(IntegerType::get(base.getContext(), 32), index));
- assert(result && "Should always succeed");
- return result;
- }
-
- friend Base;
-};
-} // namespace
-
-/// Returns the list of elements of `destructurableType` that are written to by
-/// a store operation writing `storeSize` bytes at `storeOffset`.
-/// `storeOffset` is required to cleanly point to an immediate element within
-/// the type. If the write operation were to write to any padding, write beyond
-/// the aggregate or partially write to a non-aggregate, failure is returned.
-static FailureOr<DestructurableTypeRange>
-getWrittenToFields(const DataLayout &dataLayout,
- DestructurableTypeInterface destructurableType,
- unsigned storeSize, unsigned storeOffset) {
- DestructurableTypeRange destructurableTypeRange(destructurableType);
-
- unsigned currentOffset = 0;
- for (; !destructurableTypeRange.empty();
- destructurableTypeRange = destructurableTypeRange.drop_front()) {
- Type type = destructurableTypeRange.front();
- if (!destructurableTypeRange.isPacked()) {
- unsigned alignment = dataLayout.getTypeABIAlignment(type);
- currentOffset = llvm::alignTo(currentOffset, alignment);
- }
-
- // currentOffset is guaranteed to be equal to offset since offset is either
- // 0 or stems from a type-consistent GEP indexing into just a single
- // aggregate.
- if (currentOffset == storeOffset)
- break;
-
- assert(currentOffset < storeOffset &&
- "storeOffset should cleanly point into an immediate field");
-
- currentOffset += dataLayout.getTypeSize(type);
- }
-
- size_t exclusiveEnd = 0;
- for (; exclusiveEnd < destructurableTypeRange.size() && storeSize > 0;
- exclusiveEnd++) {
- if (!destructurableTypeRange.isPacked()) {
- unsigned alignment =
- dataLayout.getTypeABIAlignment(destructurableTypeRange[exclusiveEnd]);
- // No padding allowed inbetween fields at this point in time.
- if (!llvm::isAligned(llvm::Align(alignment), currentOffset))
- return failure();
- }
-
- unsigned fieldSize =
- dataLayout.getTypeSize(destructurableTypeRange[exclusiveEnd]);
- if (fieldSize > storeSize) {
- // Partial writes into an aggregate are okay since subsequent pattern
- // applications can further split these up into writes into the
- // sub-elements.
- auto subAggregate = dyn_cast<DestructurableTypeInterface>(
- destructurableTypeRange[exclusiveEnd]);
- if (!subAggregate)
- return failure();
-
- // Avoid splitting redundantly by making sure the store into the
- // aggregate can actually be split.
- if (failed(getWrittenToFields(dataLayout, subAggregate, storeSize,
- /*storeOffset=*/0)))
- return failure();
-
- return destructurableTypeRange.take_front(exclusiveEnd + 1);
- }
- currentOffset += fieldSize;
- storeSize -= fieldSize;
- }
-
- // If the storeSize is not 0 at this point we are writing past the aggregate
- // as a whole. Abort.
- if (storeSize > 0)
- return failure();
- return destructurableTypeRange.take_front(exclusiveEnd);
-}
-
-/// Splits a store of the vector `value` into `address` at `storeOffset` into
-/// multiple stores of each element with the goal of each generated store
-/// becoming type-consistent through subsequent pattern applications.
-static void splitVectorStore(const DataLayout &dataLayout, Location loc,
- RewriterBase &rewriter, Value address,
- TypedValue<VectorType> value,
- unsigned storeOffset) {
- VectorType vectorType = value.getType();
- unsigned elementSize = dataLayout.getTypeSize(vectorType.getElementType());
-
- // Extract every element in the vector and store it in the given address.
- for (size_t index : llvm::seq<size_t>(0, vectorType.getNumElements())) {
- auto pos =
- rewriter.create<ConstantOp>(loc, rewriter.getI32IntegerAttr(index));
- auto extractOp = rewriter.create<ExtractElementOp>(loc, value, pos);
-
- // For convenience, we do indexing by calculating the final byte offset.
- // Other patterns will turn this into a type-consistent GEP.
- auto gepOp = rewriter.create<GEPOp>(
- loc, address.getType(), rewriter.getI8Type(), address,
- ArrayRef<GEPArg>{
- static_cast<int32_t>(storeOffset + index * elementSize)});
-
- rewriter.create<StoreOp>(loc, extractOp, gepOp);
- }
-}
-
-/// Splits a store of the integer `value` into `address` at `storeOffset` into
-/// multiple stores to each 'writtenToFields', making each store operation
-/// type-consistent.
-static void splitIntegerStore(const DataLayout &dataLayout, Location loc,
- RewriterBase &rewriter, Value address,
- Value value, unsigned storeSize,
- unsigned storeOffset,
- DestructurableTypeRange writtenToFields) {
- unsigned currentOffset = storeOffset;
- for (Type type : writtenToFields) {
- unsigned fieldSize = dataLayout.getTypeSize(type);
-
- // Extract the data out of the integer by first shifting right and then
- // truncating it.
- auto pos = rewriter.create<ConstantOp>(
- loc, rewriter.getIntegerAttr(value.getType(),
- (currentOffset - storeOffset) * 8));
-
- auto shrOp = rewriter.create<LShrOp>(loc, value, pos);
-
- // If we are doing a partial write into a direct field the remaining
- // `storeSize` will be less than the size of the field. We have to truncate
- // to the `storeSize` to avoid creating a store that wasn't in the original
- // code.
- IntegerType fieldIntType =
- rewriter.getIntegerType(std::min(fieldSize, storeSize) * 8);
- Value valueToStore = rewriter.create<TruncOp>(loc, fieldIntType, shrOp);
-
- // We create an `i8` indexed GEP here as that is the easiest (offset is
- // already known). Other patterns turn this into a type-consistent GEP.
- auto gepOp = rewriter.create<GEPOp>(
- loc, address.getType(), rewriter.getI8Type(), address,
- ArrayRef<GEPArg>{static_cast<int32_t>(currentOffset)});
- rewriter.create<StoreOp>(loc, valueToStore, gepOp);
-
- // No need to care about padding here since we already checked previously
- // that no padding exists in this range.
- currentOffset += fieldSize;
- storeSize -= fieldSize;
- }
-}
-
-LogicalResult SplitStores::matchAndRewrite(StoreOp store,
- PatternRewriter &rewriter) const {
- Type sourceType = store.getValue().getType();
- if (!isa<IntegerType, VectorType>(sourceType)) {
- // We currently only support integer and vector sources.
- return failure();
- }
-
- Type typeHint = isElementTypeInconsistent(store.getAddr(), sourceType);
- if (!typeHint) {
- // Nothing to do, since it is already consistent.
- return failure();
- }
-
- auto dataLayout = DataLayout::closest(store);
-
- unsigned storeSize = dataLayout.getTypeSize(sourceType);
- unsigned offset = 0;
- Value address = store.getAddr();
- if (auto gepOp = address.getDefiningOp<GEPOp>()) {
- // Currently only handle canonical GEPs with exactly two indices,
- // indexing a single aggregate deep.
- // If the GEP is not canonical we have to fail, otherwise we would not
- // create type-consistent IR.
- if (gepOp.getIndices().size() != 2 ||
- succeeded(getRequiredConsistentGEPType(gepOp)))
- return failure();
-
- // If the size of the element indexed by the GEP is smaller than the store
- // size, it is pointing into the middle of an aggregate with the store
- // storing into multiple adjacent elements. Destructure into the base
- // address of the aggregate with a store offset.
- if (storeSize > dataLayout.getTypeSize(gepOp.getResultPtrElementType())) {
- std::optional<uint64_t> byteOffset = gepToByteOffset(dataLayout, gepOp);
- if (!byteOffset)
- return failure();
-
- offset = *byteOffset;
- typeHint = gepOp.getElemType();
- address = gepOp.getBase();
- }
- }
-
- auto destructurableType = dyn_cast<DestructurableTypeInterface>(typeHint);
- if (!destructurableType)
- return failure();
-
- FailureOr<DestructurableTypeRange> writtenToElements =
- getWrittenToFields(dataLayout, destructurableType, storeSize, offset);
- if (failed(writtenToElements))
- return failure();
-
- if (writtenToElements->size() <= 1) {
- // Other patterns should take care of this case, we are only interested in
- // splitting element stores.
- return failure();
- }
-
- if (isa<IntegerType>(sourceType)) {
- splitIntegerStore(dataLayout, store.getLoc(), rewriter, address,
- store.getValue(), storeSize, offset, *writtenToElements);
- rewriter.eraseOp(store);
- return success();
- }
-
- // Add a reasonable bound to not split very large vectors that would end up
- // generating lots of code.
- if (dataLayout.getTypeSizeInBits(sourceType) > maxVectorSplitSize)
- return failure();
-
- // Vector types are simply split into its elements and new stores generated
- // with those. Subsequent pattern applications will split these stores further
- // if required.
- splitVectorStore(dataLayout, store.getLoc(), rewriter, address,
- cast<TypedValue<VectorType>>(store.getValue()), offset);
- rewriter.eraseOp(store);
- return success();
-}
-
-LogicalResult SplitGEP::matchAndRewrite(GEPOp gepOp,
- PatternRewriter &rewriter) const {
- FailureOr<Type> typeHint = getRequiredConsistentGEPType(gepOp);
- if (succeeded(typeHint) || gepOp.getIndices().size() <= 2) {
- // GEP is not canonical or a single aggregate deep, nothing to do here.
- return failure();
- }
-
- auto indexToGEPArg =
- [](GEPIndicesAdaptor<ValueRange>::value_type index) -> GEPArg {
- if (auto integerAttr = dyn_cast<IntegerAttr>(index))
- return integerAttr.getValue().getSExtValue();
- return cast<Value>(index);
- };
-
- GEPIndicesAdaptor<ValueRange> indices = gepOp.getIndices();
-
- auto splitIter = std::next(indices.begin(), 2);
-
- // Split of the first GEP using the first two indices.
- auto subGepOp = rewriter.create<GEPOp>(
- gepOp.getLoc(), gepOp.getType(), gepOp.getElemType(), gepOp.getBase(),
- llvm::map_to_vector(llvm::make_range(indices.begin(), splitIter),
- indexToGEPArg),
- gepOp.getInbounds());
-
- // The second GEP indexes on the result pointer element type of the previous
- // with all the remaining indices and a zero upfront. If this GEP has more
- // than two indices remaining it'll be further split in subsequent pattern
- // applications.
- SmallVector<GEPArg> newIndices = {0};
- llvm::transform(llvm::make_range(splitIter, indices.end()),
- std::back_inserter(newIndices), indexToGEPArg);
- rewriter.replaceOpWithNewOp<GEPOp>(gepOp, gepOp.getType(),
- subGepOp.getResultPtrElementType(),
- subGepOp, newIndices, gepOp.getInbounds());
- return success();
-}
-
-//===----------------------------------------------------------------------===//
-// Type consistency pass
-//===----------------------------------------------------------------------===//
-
-namespace {
-struct LLVMTypeConsistencyPass
- : public LLVM::impl::LLVMTypeConsistencyBase<LLVMTypeConsistencyPass> {
- void runOnOperation() override {
- RewritePatternSet rewritePatterns(&getContext());
- rewritePatterns.add<CanonicalizeAlignedGep>(&getContext());
- rewritePatterns.add<SplitStores>(&getContext(), maxVectorSplitSize);
- rewritePatterns.add<SplitGEP>(&getContext());
- FrozenRewritePatternSet frozen(std::move(rewritePatterns));
-
- if (failed(applyPatternsAndFoldGreedily(getOperation(), frozen)))
- signalPassFailure();
- }
-};
-} // namespace
-
-std::unique_ptr<Pass> LLVM::createTypeConsistencyPass() {
- return std::make_unique<LLVMTypeConsistencyPass>();
-}
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 13582a140a96..9b3121774ab3 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -2523,7 +2523,8 @@ DiagnosedSilenceableFailure transform::TileReductionUsingForOp::applyToOne(
if (failed(result))
return emitDefaultSilenceableFailure(target);
- results.push_back(result->initialOp);
+ for (Value initValue : result->initialValues)
+ results.push_back(initValue.getDefiningOp());
results.push_back(result->parallelTiledOp);
results.push_back(result->mergeOp);
results.push_back(result->loops.front());
@@ -2574,7 +2575,8 @@ DiagnosedSilenceableFailure transform::TileReductionUsingForallOp::applyToOne(
diag.attachNote(target.getLoc()) << "target operation";
return diag;
}
- results.push_back(result->initialOp);
+ for (Value initValue : result->initialValues)
+ results.push_back(initValue.getDefiningOp());
results.push_back(result->parallelTiledOp);
results.push_back(result->mergeOp);
results.push_back(result->loops);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp
index c07d1387ec75..91d4efa3372b 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/BlockPackMatmul.cpp
@@ -244,8 +244,7 @@ struct BlockPackMatmul<linalg::GenericOp>
LogicalResult matchAndRewrite(linalg::GenericOp linalgOp,
PatternRewriter &rewriter) const override {
// Match suitable generics.
- if (failed(linalg::detail::verifyContractionInterface(
- linalgOp.getOperation()))) {
+ if (!linalg::isaContractionOpInterface(linalgOp)) {
return rewriter.notifyMatchFailure(linalgOp, "not a contraction");
}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
index 146e88076566..24001c543f35 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.cpp
@@ -36,6 +36,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include <iterator>
+#include <numeric>
#include <optional>
#include <utility>
@@ -155,12 +156,12 @@ static Value createDestinationPassingStyleInitOperand(
tensor::getMixedSizes(builder, builder.getLoc(), spmdizedOperand);
PartialReductionOpInterface partialReductionIface =
llvm::cast<PartialReductionOpInterface>(op.getOperation());
- FailureOr<Operation *> reductionNeutralTensorOp =
+ assert(op->getNumResults() == 1 && "Multiple results not supported.");
+ FailureOr<SmallVector<Value>> reductionNeutralTensor =
partialReductionIface.generateInitialTensorForPartialReduction(
builder, builder.getLoc(), shape, {});
- assert(succeeded(reductionNeutralTensorOp));
- builder.create<scf::YieldOp>(
- reductionNeutralTensorOp.value()->getResult(0));
+ assert(succeeded(reductionNeutralTensor));
+ builder.create<scf::YieldOp>(reductionNeutralTensor.value());
}
return ifOp.getResult(0);
}
@@ -173,8 +174,7 @@ static SmallVector<Value> createDestinationPassingStyleInitOperands(
ImplicitLocOpBuilder &builder) {
// TODO: add support for multiple destination passing style initial value
// operands.
- // PartialReductionOpInterface::generateInitialTensorForPartialReduction
- // needs to also support multiple DPS initial operands.
+ assert(op.getNumDpsInits() == 1 && "Multiple initial values not supported.");
SmallVector<Value> newOperands = llvm::to_vector(spmdizedOperands);
auto operandIdx = op.getDpsInitOperand(0)->getOperandNumber();
Value spmdizedInitOperand =
@@ -279,6 +279,20 @@ struct StructuredOpShardingInterface
return res;
}
+ SmallVector<ReductionKind>
+ getReductionLoopIteratorKinds(Operation *op) const {
+ LinalgOp linalgOp = llvm::cast<LinalgOp>(op);
+ SmallVector<utils::IteratorType> iteratorTypes =
+ linalgOp.getIteratorTypesArray();
+ unsigned reductionItersCount = std::accumulate(
+ iteratorTypes.begin(), iteratorTypes.end(), 0,
+ [](unsigned count, utils::IteratorType iter) {
+ return count + (iter == utils::IteratorType::reduction);
+ });
+ mesh::ReductionKind reductionKind = getReductionKindOfLinalgOp(linalgOp);
+ return SmallVector<ReductionKind>(reductionItersCount, reductionKind);
+ }
+
LogicalResult spmdize(Operation *op, ArrayRef<Value> spmdizedOperands,
ArrayRef<MeshShardingAttr> operandShardings,
ArrayRef<MeshShardingAttr> resultShardings,
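[Editor's note: the new getReductionLoopIteratorKinds hook counts reduction iterators with std::accumulate (hence the added <numeric> include) and replicates the op's ReductionKind once per reduction loop. A standalone sketch of that counting idiom, with placeholder types in place of the MLIR ones.]

#include <iostream>
#include <numeric>
#include <string>
#include <vector>

// Placeholder for utils::IteratorType.
enum class IteratorType { parallel, reduction };

int main() {
  std::vector<IteratorType> iters = {IteratorType::parallel,
                                     IteratorType::reduction,
                                     IteratorType::reduction};
  // Same idiom as in the patch: accumulate a count of reduction iterators.
  unsigned reductionCount = std::accumulate(
      iters.begin(), iters.end(), 0u,
      [](unsigned count, IteratorType it) {
        return count + (it == IteratorType::reduction);
      });
  // Replicate one reduction kind per reduction loop, as the hook does.
  std::vector<std::string> kinds(reductionCount, "sum");
  std::cout << kinds.size() << " reduction loops\n"; // prints 2
  return 0;
}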
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index df4089d61bfd..fd314ef9f813 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -692,12 +692,13 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
op, "reduction dimension must be mapped to threads");
// 1. Create the initial tensor value.
- FailureOr<Operation *> identityTensor =
+ FailureOr<SmallVector<Value>> maybeInitTensors =
op.generateInitialTensorForPartialReduction(b, loc, numThreads,
reductionDim);
- if (failed(identityTensor))
- return b.notifyMatchFailure(op,
- "cannot create a tensor of identity value.");
+ if (failed(maybeInitTensors))
+ return b.notifyMatchFailure(
+ op, "Failed to create inital tensors for partial reduction");
+ SmallVector<Value> &initTensors = maybeInitTensors.value();
// Gather destination tensors.
SmallVector<Value> dest;
@@ -715,8 +716,8 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
// 2. Create the ForallOp with an empty region.
scf::ForallOp forallOp = b.create<scf::ForallOp>(
- loc, getAsOpFoldResult(materializedNonZeroNumThreads),
- (*identityTensor)->getResults(), mapping);
+ loc, getAsOpFoldResult(materializedNonZeroNumThreads), initTensors,
+ mapping);
// 3. Calculate the tile offsets and sizes for the subsequent loop that will
// be nested under `forallOp`.
@@ -726,7 +727,7 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
/*nominalTileSizes=*/std::nullopt, tiledOffsets,
tiledSizes);
- // 4. Clone the tileable op and update its destination operands to use the
+ // 4b. Clone the tileable op and update its destination operands to use the
// output bbArgs of the ForallOp.
SmallVector<Value> tilingResults;
ArrayRef<BlockArgument> destBbArgs = forallOp.getRegionIterArgs();
@@ -838,7 +839,7 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
// 8. Return.
ForallReductionTilingResult results;
- results.initialOp = *identityTensor;
+ results.initialValues = initTensors;
results.loops = forallOp;
results.parallelTiledOp = tiledOp;
results.mergeOp = mergeOp;
diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
index bd870d4f982e..f512be46cc13 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp
@@ -250,7 +250,7 @@ template <typename LinalgOpTy>
struct LinalgOpPartialReductionInterface
: public PartialReductionOpInterface::ExternalModel<
LinalgOpPartialReductionInterface<LinalgOpTy>, LinalgOpTy> {
- FailureOr<Operation *> generateInitialTensorForPartialReduction(
+ FailureOr<SmallVector<Value>> generateInitialTensorForPartialReduction(
Operation *op, OpBuilder &b, Location loc, ArrayRef<OpFoldResult> sizes,
ArrayRef<int> reductionDims) const {
auto linalgOp = cast<LinalgOp>(op);
@@ -258,50 +258,58 @@ struct LinalgOpPartialReductionInterface
if (linalgOp.hasPureBufferSemantics())
return op->emitOpError("expected operation to have tensor semantics");
- // Insert the new parallel dimension based on the index of the reduction
- // loops. This could be controlled by user for more flexibility.
- SmallVector<Operation *, 4> combinerOps;
- if (!matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps) ||
- combinerOps.size() != 1)
- return op->emitOpError("Failed to anaysis the reduction operation.");
-
- Operation *reductionOp = combinerOps[0];
- std::optional<TypedAttr> identity = arith::getNeutralElement(reductionOp);
- if (!identity.has_value())
- return op->emitOpError(
- "Failed to get an identity value for the reduction operation.");
-
- ArrayRef<int64_t> oldShape =
- linalgOp.getShape(linalgOp.getDpsInitOperand(0));
-
- // Calculate the new shape, we insert the new dimensions based on the index
- // of the reduction dimensions.
- SmallVector<int64_t> newOutputShape;
- SmallVector<Value> dynamicDims;
- int64_t currReductionDims = 0;
- DenseSet<int> reductionDimsSet(reductionDims.begin(), reductionDims.end());
- for (int64_t idx :
- llvm::seq<int64_t>(0, oldShape.size() + reductionDims.size())) {
- if (reductionDimsSet.contains(idx)) {
- dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape);
- currReductionDims++;
- continue;
+ SmallVector<Value> inits;
+ for (int initIdx = 0, e = linalgOp.getNumDpsInits(); initIdx < e;
+ ++initIdx) {
+ // Insert the new parallel dimension based on the index of the reduction
+ // loops. This could be controlled by user for more flexibility.
+ SmallVector<Operation *, 4> combinerOps;
+ if (!matchReduction(linalgOp.getRegionOutputArgs(), initIdx,
+ combinerOps) ||
+ combinerOps.size() != 1)
+ return op->emitOpError("Failed to anaysis the reduction operation.");
+
+ Operation *reductionOp = combinerOps[0];
+ std::optional<TypedAttr> identity = arith::getNeutralElement(reductionOp);
+ if (!identity.has_value())
+ return op->emitOpError(
+ "Failed to get an identity value for the reduction operation.");
+
+ ArrayRef<int64_t> oldShape =
+ linalgOp.getShape(linalgOp.getDpsInitOperand(initIdx));
+
+ // Calculate the new shape; we insert the new dimensions based on the
+ // index of the reduction dimensions.
+ SmallVector<int64_t> newOutputShape;
+ SmallVector<Value> dynamicDims;
+ int64_t currReductionDims = 0;
+ DenseSet<int> reductionDimsSet(reductionDims.begin(),
+ reductionDims.end());
+ for (int64_t idx :
+ llvm::seq<int64_t>(0, oldShape.size() + reductionDims.size())) {
+ if (reductionDimsSet.contains(idx)) {
+ dispatchIndexOpFoldResults(sizes[idx], dynamicDims, newOutputShape);
+ currReductionDims++;
+ continue;
+ }
+ int64_t oldIdx = idx - currReductionDims;
+ int64_t dim = oldShape[oldIdx];
+ newOutputShape.push_back(dim);
+ if (ShapedType::isDynamic(dim))
+ dynamicDims.push_back(b.create<tensor::DimOp>(
+ loc, linalgOp.getDpsInitOperand(initIdx)->get(), oldIdx));
}
- int64_t oldIdx = idx - currReductionDims;
- int64_t dim = oldShape[oldIdx];
- newOutputShape.push_back(dim);
- if (ShapedType::isDynamic(dim))
- dynamicDims.push_back(b.create<tensor::DimOp>(
- loc, linalgOp.getDpsInitOperand(0)->get(), oldIdx));
+ Value emptyTensor = b.create<tensor::EmptyOp>(
+ loc, newOutputShape,
+ linalgOp.getRegionOutputArgs()[initIdx].getType(), dynamicDims);
+ Value constantOp = b.create<arith::ConstantOp>(loc, *identity);
+ auto identityTensor =
+ b.create<linalg::FillOp>(loc, constantOp, emptyTensor);
+ inits.push_back(identityTensor.getResult(0));
}
- Value emptyTensor = b.create<tensor::EmptyOp>(
- loc, newOutputShape, linalgOp.getRegionOutputArgs()[0].getType(),
- dynamicDims);
- Value constantOp = b.create<arith::ConstantOp>(loc, *identity);
- auto identityTensor =
- b.create<linalg::FillOp>(loc, constantOp, emptyTensor);
- return identityTensor.getOperation();
+
+ return inits;
}
Operation *tileToPartialReduction(Operation *op, OpBuilder &b, Location loc,
@@ -312,44 +320,64 @@ struct LinalgOpPartialReductionInterface
OpBuilder::InsertionGuard guard(b);
auto linalgOp = cast<LinalgOp>(op);
- AffineMap oldOutputMap =
- linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(0));
- SmallVector<AffineExpr> outputExpr(oldOutputMap.getNumResults() +
- reductionDims.size());
-
- for (int idx : reductionDims)
- outputExpr[idx] = b.getAffineDimExpr(idx);
- int currExpr = 0;
- for (int idx : llvm::seq<int>(0, outputExpr.size())) {
- if (outputExpr[idx])
- continue;
- outputExpr[idx] = oldOutputMap.getResult(currExpr++);
+ // Step 1. Extend the init maps with the reduction dims, since we are
+ // converting them to parallel dimensions.
+ SmallVector<AffineMap> newInitMaps;
+ newInitMaps.reserve(linalgOp.getNumDpsInits());
+ for (int idx : llvm::seq<int>(0, linalgOp.getNumDpsInits())) {
+ // TODO: linalg::Generic doesn't have getDpsInitOperands. Can replace
+ // this with a for range loop when we have it.
+ AffineMap newMap =
+ linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(idx));
+ for (int redPos : reductionDims) {
+ newMap = newMap.insertResult(b.getAffineDimExpr(redPos),
+ newMap.getNumResults());
+ }
+ newInitMaps.push_back(newMap);
}
- // Step 1: Extract a slice of the input operands.
- SmallVector<Value> valuesToTile = linalgOp.getDpsInputs();
- SmallVector<Value, 4> tiledOperands = makeTiledShapes(
- b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true);
+ // Step 2a: Extract a slice of the input operands.
+ SmallVector<Value, 4> tiledInputs = makeTiledShapes(
+ b, loc, linalgOp, linalgOp.getDpsInputs(), offsets, sizes, {}, true);
+
+ // Step 2b: Extract a slice of the init operands.
+ SmallVector<Value, 1> tiledInits;
+ for (auto [valueMap, valueToTile] : llvm::zip_equal(newInitMaps, init)) {
+ int64_t initRank = valueMap.getNumResults();
+ SmallVector<OpFoldResult> initOffset(initRank, b.getIndexAttr(0));
+ SmallVector<OpFoldResult> initStride(initRank, b.getIndexAttr(1));
+ SmallVector<OpFoldResult> initSizes;
+ for (AffineExpr dimExpr : valueMap.getResults()) {
+ auto dim = cast<AffineDimExpr>(dimExpr);
+ initSizes.push_back(sizes[dim.getPosition()]);
+ }
+ // TODO: Use SubsetExtractOpInterface here once available.
+ auto extractSlice = b.create<tensor::ExtractSliceOp>(
+ loc, valueToTile, initOffset, initSizes, initStride);
+ tiledInits.push_back(extractSlice);
+ }
- // Step 2: Extract the accumulator operands
- SmallVector<OpFoldResult> strides(offsets.size(), b.getIndexAttr(1));
- SmallVector<OpFoldResult> outOffsets(offsets.size(), b.getIndexAttr(0));
- // TODO: use SubsetExtractOpInterface once it is available.
- Value out = b.create<tensor::ExtractSliceOp>(loc, init[0], outOffsets,
- sizes, strides);
+ // Update the indexing maps.
+ SmallVector<AffineMap> newMaps = linalgOp.getIndexingMapsArray();
+ // Change the init maps.
+ for (int idx : llvm::seq<int>(0, linalgOp.getNumDpsInits())) {
+ // TODO: linalg::Generic doesn't have getDpsInitOperands. Can replace
+ // this with a for range loop when we have it.
+ OpOperand *initOperand = linalgOp.getDpsInitOperand(idx);
+ int64_t mapIdx = linalgOp.getIndexingMapIndex(initOperand);
+ newMaps[mapIdx] = newInitMaps[idx];
+ }
- // Step3. Create a generic op where the reduction dimensions are replaced
- // by a parallel dimension of the size of reduction.
+ // Step 3. Change the reduction dim iterator types.
SmallVector<utils::IteratorType> newIteratorTypes =
linalgOp.getIteratorTypesArray();
for (int dim : reductionDims)
newIteratorTypes[dim] = utils::IteratorType::parallel;
- SmallVector<AffineMap> newMaps = linalgOp.getIndexingMapsArray();
- newMaps.back() = AffineMap::get(newMaps.back().getNumDims(), 0, outputExpr,
- linalgOp.getContext());
+
+ // Step 4. Create the new generic op.
auto genericOp =
- b.create<GenericOp>(loc, TypeRange({out.getType()}), tiledOperands,
- ValueRange({out}), newMaps, newIteratorTypes);
+ b.create<GenericOp>(loc, ValueRange(tiledInits).getTypes(), tiledInputs,
+ tiledInits, newMaps, newIteratorTypes);
IRMapping mapping;
op->getRegion(0).cloneInto(&genericOp.getRegion(),
genericOp.getRegion().begin(), mapping);
@@ -361,40 +389,53 @@ struct LinalgOpPartialReductionInterface
ArrayRef<int> reductionDims) const {
auto linalgOp = cast<LinalgOp>(op);
- DenseSet<int> reductionDimsSet(reductionDims.begin(), reductionDims.end());
-
- // Then create a new reduction that only reduce the newly added dimensions
- // from the previous op.
- int64_t intermRank = cast<ShapedType>(partialReduce[0].getType()).getRank();
- AffineMap inputMap = b.getMultiDimIdentityMap(intermRank);
- SmallVector<utils::IteratorType> reductionIteratorTypes;
- SmallVector<AffineExpr> exprs;
-
- for (int64_t i : llvm::seq<int64_t>(0, intermRank)) {
- if (reductionDimsSet.contains(i)) {
- reductionIteratorTypes.push_back(utils::IteratorType::reduction);
- } else {
- exprs.push_back(b.getAffineDimExpr(i));
- reductionIteratorTypes.push_back(utils::IteratorType::parallel);
+ // Step 1. Recover the dims that actually need to be merged from the
+ // original operation. We can classify the original iterators as follows:
+ //
+ // parallel --> parallel
+ // reduction + not in reductionDims --> parallel (already reduced)
+ // reduction + in reductionDims --> reduction (will reduce now)
+ SmallVector<utils::IteratorType> iterators(linalgOp.getNumLoops(),
+ utils::IteratorType::parallel);
+ for (int redIdx : reductionDims)
+ iterators[redIdx] = utils::IteratorType::reduction;
+
+ // Step 2. For each partial result, create a map to index it. This map
+ // is simply the indexing map for the original result with reductionDims
+ // appended (as produced in tileToPartialReduction).
+ int64_t numInits = linalgOp.getNumDpsInits();
+ SmallVector<AffineMap> indexingMaps(numInits * 2);
+ for (int idx : llvm::seq<int>(0, numInits)) {
+ AffineMap &inputMap = indexingMaps[idx];
+ AffineMap &outputMap = indexingMaps[numInits + idx];
+
+ outputMap =
+ linalgOp.getMatchingIndexingMap(linalgOp.getDpsInitOperand(idx));
+ inputMap = outputMap;
+ for (int redPos : reductionDims) {
+ inputMap = inputMap.insertResult(b.getAffineDimExpr(redPos),
+ inputMap.getNumResults());
}
}
- AffineMap outputMap =
- AffineMap::get(intermRank, 0, exprs, op->getContext());
- SmallVector<AffineMap> reductionMaps = {inputMap, outputMap};
-
- SmallVector<Operation *, 4> combinerOps;
- matchReduction(linalgOp.getRegionOutputArgs(), 0, combinerOps);
- Operation *reductionOp = combinerOps[0];
-
auto reduction = b.create<GenericOp>(
- loc, op->getResultTypes(), ValueRange({partialReduce[0]}),
- linalgOp.getDpsInits(), reductionMaps, reductionIteratorTypes,
- [reductionOp](OpBuilder &b, Location loc, ValueRange inputs) {
- Operation *clonedReductionOp = b.clone(*reductionOp);
- clonedReductionOp->setOperand(0, inputs[0]);
- clonedReductionOp->setOperand(1, inputs[1]);
- b.create<linalg::YieldOp>(loc, clonedReductionOp->getResult(0));
+ loc, op->getResultTypes(), partialReduce, linalgOp.getDpsInits(),
+ indexingMaps, iterators,
+ [&linalgOp](OpBuilder &b, Location loc, ValueRange inputs) {
+ int64_t numInits = linalgOp.getNumDpsInits();
+ SmallVector<Value> yieldedValues;
+ for (int idx : llvm::seq<int>(0, numInits)) {
+ // Get the combiner op.
+ SmallVector<Operation *, 4> combinerOps;
+ matchReduction(linalgOp.getRegionOutputArgs(), idx, combinerOps);
+ Operation *clonedReductionOp = b.clone(*combinerOps[0]);
+ // Combine the input at idx and output at numInits + idx.
+ clonedReductionOp->setOperand(0, inputs[idx]);
+ clonedReductionOp->setOperand(1, inputs[numInits + idx]);
+ // Yield.
+ yieldedValues.push_back(clonedReductionOp->getResult(0));
+ }
+ b.create<linalg::YieldOp>(loc, yieldedValues);
});
return reduction.getOperation();
}
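[Editor's note: the interface now returns one init value per DPS init instead of a single op, but the partial-reduction flow itself is unchanged: materialize a neutral-element init per result, let each tile reduce into its own partial copy, then merge the partials with the original combiner. A scalar analogy of that three-step flow, illustrative only and with no MLIR types.]

#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> data = {1, 2, 3, 4, 5, 6, 7, 8};
  const int numTiles = 4;

  // Step 1: one neutral-element init per tile (identity of addition is 0).
  std::vector<int> partials(numTiles, 0);

  // Step 2: each tile reduces its slice into its own partial result.
  const int tileSize = static_cast<int>(data.size()) / numTiles;
  for (int t = 0; t < numTiles; ++t)
    for (int i = 0; i < tileSize; ++i)
      partials[t] += data[t * tileSize + i];

  // Step 3: merge the partial results with the same combiner.
  int total = std::accumulate(partials.begin(), partials.end(), 0);
  std::cout << total << "\n"; // 36
  return 0;
}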
diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
index d4329b401df1..ec1acbbb9349 100644
--- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
+++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp
@@ -20,6 +20,7 @@
#include "mlir/IR/Location.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
+#include "mlir/IR/Value.h"
#include "mlir/Interfaces/ViewLikeInterface.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
@@ -28,6 +29,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
#include <algorithm>
#include <functional>
#include <iterator>
@@ -99,7 +101,7 @@ Operation *MeshDialect::materializeConstant(OpBuilder &builder, Attribute value,
static FailureOr<MeshOp> getMeshAndVerify(Operation *op,
FlatSymbolRefAttr meshSymbol,
SymbolTableCollection &symbolTable) {
- mesh::MeshOp mesh = getMesh(op, meshSymbol, symbolTable);
+ mesh::MeshOp mesh = getMeshOrNull(op, meshSymbol, symbolTable);
if (!mesh) {
return op->emitError() << "Undefined required mesh symbol \""
<< meshSymbol.getValue() << "\".";
@@ -178,6 +180,88 @@ Type mesh::shardType(Type type, MeshOp mesh, MeshShardingAttr sharding) {
return type;
}
+void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshShardingAttr sharding,
+ OpOperand &operand,
+ OpBuilder &builder) {
+ OpBuilder::InsertionGuard insertionGuard(builder);
+ Value operandValue = operand.get();
+ Operation *operandOp = operand.getOwner();
+ builder.setInsertionPointAfterValue(operandValue);
+ ShardOp shardOp = dyn_cast<ShardOp>(operandOp);
+ if (shardOp && shardOp.getShard() == sharding &&
+ !shardOp.getAnnotateForUsers()) {
+ // No need to do anything; the correct sharding is already set.
+ return;
+ }
+
+ auto newShardOp =
+ builder.create<ShardOp>(operandValue.getLoc(), operandValue, sharding,
+ /*annotate_for_users*/ false);
+ IRRewriter rewriter(builder);
+ rewriter.replaceUsesWithIf(
+ operandValue, newShardOp, [operandOp, operandValue](OpOperand &use) {
+ return use.getOwner() == operandOp && use.get() == operandValue;
+ });
+
+ if (!shardOp || shardOp.getAnnotateForUsers()) {
+ return;
+ }
+
+ auto newShardOp2 = builder.create<ShardOp>(
+ operandValue.getLoc(), newShardOp, sharding, /*annotate_for_users*/ true);
+ rewriter.replaceAllUsesExcept(newShardOp, newShardOp2, newShardOp2);
+}
+
+void mlir::mesh::maybeInsertTargetShardingAnnotation(MeshShardingAttr sharding,
+ OpResult result,
+ OpBuilder &builder) {
+ for (auto &use : llvm::make_early_inc_range(result.getUses())) {
+ maybeInsertTargetShardingAnnotation(sharding, use, builder);
+ }
+}
+
+void mlir::mesh::maybeInsertSourceShardingAnnotation(MeshShardingAttr sharding,
+ OpOperand &operand,
+ OpBuilder &builder) {
+ OpBuilder::InsertionGuard insertionGuard(builder);
+ Value operandValue = operand.get();
+ Operation *operandOp = operand.getOwner();
+ Operation *operandSrcOp = operandValue.getDefiningOp();
+ bool isBlockArg = !operandSrcOp;
+ ShardOp shardOp = dyn_cast_or_null<ShardOp>(operandSrcOp);
+
+ if (shardOp && shardOp.getShard() == sharding &&
+ shardOp.getAnnotateForUsers()) {
+ // No need to do anything; the correct sharding is already set.
+ return;
+ }
+
+ builder.setInsertionPoint(operandOp);
+ auto newShardOp =
+ builder.create<ShardOp>(operandValue.getLoc(), operandValue, sharding,
+ /*annotate_for_users*/ true);
+ IRRewriter rewriter(builder);
+ rewriter.replaceUsesWithIf(
+ operandValue, newShardOp, [operandOp, operandValue](OpOperand &use) {
+ return use.getOwner() == operandOp && use.get() == operandValue;
+ });
+
+ if (isBlockArg || !shardOp || !shardOp.getAnnotateForUsers()) {
+ // No need for resharding.
+ return;
+ }
+
+ builder.setInsertionPoint(newShardOp);
+ auto newPreceedingShardOp =
+ builder.create<ShardOp>(operandValue.getLoc(), operandValue, sharding,
+ /*annotate_for_users*/ false);
+ rewriter.replaceUsesWithIf(newShardOp.getOperand(), newPreceedingShardOp,
+ [&newShardOp](OpOperand &use) {
+ return use.getOwner() ==
+ newShardOp.getOperation();
+ });
+}
+
//===----------------------------------------------------------------------===//
// mesh.mesh op
//===----------------------------------------------------------------------===//
@@ -286,6 +370,10 @@ bool MeshShardingAttr::operator==(Attribute rhs) const {
return rhsAsMeshShardingAttr && *this == rhsAsMeshShardingAttr;
}
+bool MeshShardingAttr::operator!=(Attribute rhs) const {
+ return !(*this == rhs);
+}
+
bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const {
if (getMesh() != rhs.getMesh() || getPartialAxes() != rhs.getPartialAxes()) {
return false;
@@ -311,6 +399,10 @@ bool MeshShardingAttr::operator==(MeshShardingAttr rhs) const {
std::mem_fn(&MeshAxesAttr::empty));
}
+bool MeshShardingAttr::operator!=(MeshShardingAttr rhs) const {
+ return !(*this == rhs);
+}
+
//===----------------------------------------------------------------------===//
// mesh.shard op
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp
index dbb9e667d470..54fc91cb2642 100644
--- a/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp
+++ b/mlir/lib/Dialect/Mesh/Interfaces/ShardingInterface.cpp
@@ -13,6 +13,7 @@
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -388,22 +389,11 @@ FailureOr<ShardingOption> mesh::detail::defaultGetShardingOption(
return shardingOption;
}
-//===----------------------------------------------------------------------===//
-// detail::defaultAddShardingAnnotations
-//===----------------------------------------------------------------------===//
-
-// To add a `mesh.shard` op for the given result, based on the details provided
-// in `shardingOption`, `map`, and `loopTypes`.
-static LogicalResult addShardOp(OpBuilder &b, OpResult result,
- const ShardingOption &shardingOption,
- AffineMap map,
- ArrayRef<utils::IteratorType> loopTypes,
- ArrayRef<ReductionKind> reductionLoopKinds) {
- FailureOr<std::pair<bool, MeshShardingAttr>> maybeSharding =
- getMeshShardingAttr(result);
- if (succeeded(maybeSharding) && !maybeSharding->first)
- return success();
-
+// Get the sharding attribute for the given result and sharding option.
+MeshShardingAttr
+getShardingAttribute(OpResult result, const ShardingOption &shardingOption,
+ AffineMap map, ArrayRef<utils::IteratorType> loopTypes,
+ ArrayRef<ReductionKind> reductionLoopKinds) {
auto resultType = cast<RankedTensorType>(result.getType());
SmallVector<SmallVector<MeshAxis>> splitAxes(resultType.getRank());
SmallVector<MeshAxis> partialAxes;
@@ -438,26 +428,15 @@ static LogicalResult addShardOp(OpBuilder &b, OpResult result,
}
removeTrailingEmptySubArray(splitAxes);
- MeshShardingAttr shardAttr = MeshShardingAttr::get(
- b.getContext(), shardingOption.mesh, splitAxes, partialAxes, partialType);
- OpBuilder::InsertionGuard guard(b);
- b.setInsertionPointAfterValue(result);
- auto shardOp = b.create<ShardOp>(result.getLoc(), resultType, result,
- shardAttr, /*annotate_for_users*/ false);
- result.replaceAllUsesExcept(shardOp, shardOp);
- return success();
+ return MeshShardingAttr::get(result.getContext(), shardingOption.mesh,
+ splitAxes, partialAxes, partialType);
}
-// To add a `mesh.shard` op for the given operand, based on the details provided
-// in `shardingOption`, `map`, and `loopTypes`.
-static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand,
- const ShardingOption &shardingOption,
- AffineMap map) {
- auto maybeShardingAttr = getMeshShardingAttr(opOperand);
- if (succeeded(maybeShardingAttr) && maybeShardingAttr->first)
- return success();
- Value operand = opOperand.get();
- auto operandType = cast<RankedTensorType>(operand.getType());
+static FailureOr<MeshShardingAttr>
+getShardingAttribute(OpOperand &opOperand, const ShardingOption &shardingOption,
+ AffineMap map) {
+ Value operandValue = opOperand.get();
+ auto operandType = cast<RankedTensorType>(operandValue.getType());
SmallVector<SmallVector<MeshAxis>> splitAxes(operandType.getRank());
unsigned numDims = map.getNumDims();
for (auto it : llvm::enumerate(map.getResults())) {
@@ -483,19 +462,79 @@ static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand,
}
removeTrailingEmptySubArray(splitAxes);
- MeshShardingAttr shardAttr =
- MeshShardingAttr::get(b.getContext(), shardingOption.mesh, splitAxes);
+ return MeshShardingAttr::get(opOperand.get().getContext(),
+ shardingOption.mesh, splitAxes);
+}
+
+FailureOr<SmallVector<MeshShardingAttr>>
+mesh::detail::defaultGetShardingAnnotations(
+ Operation *op, const ShardingOption &shardingOption) {
+ SmallVector<MeshShardingAttr> res;
+
+ ShardingInterface shardingOp = llvm::cast<ShardingInterface>(op);
+ SmallVector<utils::IteratorType> loopTypes =
+ shardingOp.getLoopIteratorTypes();
+ SmallVector<ReductionKind> reductionKinds =
+ shardingOp.getReductionLoopIteratorKinds();
+ SmallVector<AffineMap> maps = shardingOp.getIndexingMaps();
+ unsigned numOperands = op->getNumOperands();
+
+ for (OpOperand &opOperand : op->getOpOperands()) {
+ FailureOr<MeshShardingAttr> shardingAttr = getShardingAttribute(
+ opOperand, shardingOption, maps[opOperand.getOperandNumber()]);
+ if (failed(shardingAttr))
+ return failure();
+ res.push_back(*shardingAttr);
+ }
+
+ for (OpResult result : op->getResults()) {
+ res.push_back(getShardingAttribute(
+ result, shardingOption, maps[numOperands + result.getResultNumber()],
+ loopTypes, reductionKinds));
+ }
+
+ return res;
+}
+
+//===----------------------------------------------------------------------===//
+// detail::defaultAddShardingAnnotations
+//===----------------------------------------------------------------------===//
+
+// To add a `mesh.shard` op for the given result, based on the details provided
+// in `shardingOption`, `map`, and `loopTypes`.
+static LogicalResult addShardOp(OpBuilder &b, OpResult result,
+ const ShardingOption &shardingOption,
+ AffineMap map,
+ ArrayRef<utils::IteratorType> loopTypes,
+ ArrayRef<ReductionKind> reductionLoopKinds) {
+ MeshShardingAttr shardAttr = getShardingAttribute(
+ result, shardingOption, map, loopTypes, reductionLoopKinds);
+ maybeInsertTargetShardingAnnotation(shardAttr, result, b);
+
+ return success();
+}
+
+// Add a `mesh.shard` op for the given operand, based on the details provided
+// in `shardingOption` and `map`.
+static LogicalResult addShardOp(OpBuilder &b, OpOperand &opOperand,
+ const ShardingOption &shardingOption,
+ AffineMap map) {
+
+ FailureOr<MeshShardingAttr> shardAttr =
+ getShardingAttribute(opOperand, shardingOption, map);
+ if (failed(shardAttr)) {
+ return failure();
+ }
OpBuilder::InsertionGuard guard(b);
- b.setInsertionPoint(opOperand.getOwner());
- auto shardOp = b.create<ShardOp>(operand.getLoc(), operandType, operand,
- shardAttr, true);
- opOperand.set(shardOp);
+ maybeInsertSourceShardingAnnotation(*shardAttr, opOperand, b);
return success();
}
LogicalResult mesh::detail::defaultAddShardingAnnotations(
Operation *op, OpBuilder &b, const ShardingOption &shardingOption) {
+ assert(!shardingOption.empty && shardingOption.mesh);
+
ShardingInterface shardingOp = llvm::cast<ShardingInterface>(op);
SmallVector<utils::IteratorType> loopTypes =
shardingOp.getLoopIteratorTypes();
diff --git a/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp b/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
index 29320f1e339f..870ac4a16808 100644
--- a/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
+++ b/mlir/lib/Dialect/Mesh/Transforms/ShardingPropagation.cpp
@@ -12,9 +12,16 @@
#include "mlir/Dialect/Mesh/IR/MeshDialect.h"
#include "mlir/Dialect/Mesh/IR/MeshOps.h"
#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
+#include "mlir/IR/Verifier.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
#include <vector>
namespace mlir {
@@ -30,6 +37,70 @@ namespace mesh {
using namespace mlir;
using namespace mlir::mesh;
+enum class ReshardingRquirementKind {
+ NO_RESHARDING = 0,
+ NO_RESHARDING_FOR_EXPLICIT_ANNOTATIONS,
+ RESHARDING_FOR_EXPLICIT_ANNOTATIONS
+};
+
+#ifdef LLVM_DEBUG
+
+template <typename T>
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const SmallVector<T> &vec);
+template <typename... Ts>
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const std::tuple<Ts...> &t);
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ ReshardingRquirementKind v);
+
+template <typename Stream, typename Range>
+static Stream &printRange(Stream &stream, Range &&range) {
+ stream << "[";
+ llvm::for_each(range, [&stream](auto &v) {
+ stream << v;
+ stream << ", ";
+ });
+ return stream << "]";
+}
+
+template <typename T>
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const SmallVector<T> &vec) {
+ return printRange(stream, vec);
+}
+
+[[maybe_unused]] static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const ShardingOption &v) {
+  return stream << "{empty = " << v.empty << ", mesh = " << v.mesh
+ << ", shardingArray = " << v.shardingArray << "}";
+}
+
+template <typename Stream, typename... Ts, size_t... Is>
+static Stream &printTuple(Stream &stream, std::tuple<Ts...> tuple,
+ std::index_sequence<Is...>) {
+ static_assert(sizeof...(Is) == sizeof...(Ts),
+ "Indices must have same number of elements as tuple types!");
+ static_assert(sizeof...(Ts) > 0, "Cannot insert empty tuple into stream.");
+
+ stream << "{";
+ ((stream << std::get<Is>(tuple) << ", "), ...);
+ return stream << "}";
+}
+
+template <typename... Ts>
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &stream,
+ const std::tuple<Ts...> &t) {
+ return printTuple(stream, t, std::index_sequence_for<Ts...>{});
+}
+
+[[maybe_unused]] static llvm::raw_ostream &
+operator<<(llvm::raw_ostream &stream, ReshardingRquirementKind v) {
+ return stream << static_cast<int>(v);
+}
+
+#endif // LLVM_DEBUG
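For readers unfamiliar with the debug helpers added above, here is a standalone sketch (not part of the patch; it only assumes the usual llvm/ADT and llvm/Support headers) showing what printRange emits for a SmallVector, including the trailing separator produced by the implementation above.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"

// Same shape as the helper in the patch: stream a range as "[v0, v1, ..., ]".
template <typename Stream, typename Range>
static Stream &printRange(Stream &stream, Range &&range) {
  stream << "[";
  llvm::for_each(range, [&stream](auto &v) { stream << v << ", "; });
  return stream << "]";
}

int main() {
  llvm::SmallVector<int> vec = {1, 2, 3};
  printRange(llvm::errs(), vec) << "\n"; // prints: [1, 2, 3, ]
  return 0;
}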
+
//===----------------------------------------------------------------------===//
// Utilities
//===----------------------------------------------------------------------===//
@@ -77,6 +148,138 @@ getOrderedPossibleShardingAttrs(ArrayRef<MeshShardingAttr> mustShardings,
return allShardingAttrs;
}
+// The order of preference is from highest to lowest:
+// 1. No resharding is required (all existing annotations are compatible).
+// 2. No resharding for operands/results that have an annotation specifically
+//    targeting this operation. This means
+//      * operands that are the result of `mesh.shard` ops marked with
+//        `annotate_for_users`.
+//      * results that are annotated with `mesh.shard` ops without
+//        `annotate_for_users`.
+// 3. All other cases. Resharding is required for operands/results with
+//    annotations that explicitly target this operation.
+ReshardingRquirementKind getReshardingRquirementKind(
+ Operation *op,
+ const SmallVector<MeshShardingAttr> &operandAndResultShardings) {
+ ReshardingRquirementKind res = ReshardingRquirementKind::NO_RESHARDING;
+
+ size_t operandsCount = op->getOperands().size();
+ auto operandShardings =
+ llvm::make_range(operandAndResultShardings.begin(),
+ operandAndResultShardings.begin() + operandsCount);
+ auto resultShardings =
+ llvm::make_range(operandAndResultShardings.begin() + operandsCount,
+ operandAndResultShardings.end());
+
+ for (auto [operand, sharding] :
+ llvm::zip_equal(op->getOperands(), operandShardings)) {
+ ShardOp shardOp = llvm::dyn_cast_or_null<ShardOp>(operand.getDefiningOp());
+ if (!shardOp) {
+ continue;
+ }
+ bool needsResharding = shardOp.getShardAttr() != sharding;
+ bool isExplicitAnnotationForThisOp = shardOp.getAnnotateForUsers();
+ if (needsResharding) {
+ if (isExplicitAnnotationForThisOp) {
+ // This is the worst case. No need to continue.
+ return ReshardingRquirementKind::RESHARDING_FOR_EXPLICIT_ANNOTATIONS;
+ }
+ res = ReshardingRquirementKind::NO_RESHARDING_FOR_EXPLICIT_ANNOTATIONS;
+ }
+ }
+
+ for (auto [result, sharding] :
+ llvm::zip_equal(op->getResults(), resultShardings)) {
+ for (auto user : result.getUsers()) {
+ ShardOp shardOp = llvm::dyn_cast<ShardOp>(user);
+ if (!shardOp) {
+ continue;
+ }
+ bool needsResharding = shardOp.getShardAttr() != sharding;
+ bool isExplicitAnnotationForThisOp = !shardOp.getAnnotateForUsers();
+ if (needsResharding) {
+ if (isExplicitAnnotationForThisOp) {
+ // This is the worst case. No need to continue.
+ return ReshardingRquirementKind::RESHARDING_FOR_EXPLICIT_ANNOTATIONS;
+ }
+ res = ReshardingRquirementKind::NO_RESHARDING_FOR_EXPLICIT_ANNOTATIONS;
+ }
+ }
+ }
+
+ return res;
+}
+
+// From all the operand and result sharding combinations,
+// return the one that is most desirable.
+// The order of preference is:
+// 1. No resharding with respect to existing sharding annotations.
+// 2. Resharding for values that already have annotations that do not target
+// this op.
+// 3. Resharding of existing explicit sharding annotations for this op.
+static FailureOr<ShardingOption> selectShardingOption(
+ ShardingInterface shardingOp,
+ ArrayRef<SmallVector<MeshShardingAttr>> possibleOperandShardingAttrs,
+ ArrayRef<SmallVector<MeshShardingAttr>> possibleResultShardingAttrs) {
+ SmallVector<std::tuple<ShardingOption, ReshardingRquirementKind>>
+ shardingOptionsAndReshardingRequirements;
+
+ for (ArrayRef<MeshShardingAttr> resultShardings :
+ possibleResultShardingAttrs) {
+ for (ArrayRef<MeshShardingAttr> operandShardings :
+ possibleOperandShardingAttrs) {
+ FailureOr<ShardingOption> shardingOption =
+ shardingOp.getShardingOption(operandShardings, resultShardings);
+ if (failed(shardingOption) || shardingOption->empty) {
+ continue;
+ }
+ // These shardings may not be the same as those in operandShardings and
+ // resultShardings.
+ // They may be missing some annotations.
+ // Whatever is returned by getShardingAnnotations is exactly what the op
+ // needs.
+ FailureOr<SmallVector<MeshShardingAttr>> operandAndResultShardings =
+ shardingOp.getShardingAnnotations(*shardingOption);
+ if (failed(operandAndResultShardings)) {
+ return failure();
+ }
+
+ LLVM_DEBUG(DBGS() << "operandAndResultShardings = "
+ << *operandAndResultShardings << "\n";);
+
+ ReshardingRquirementKind reshardingRquirement =
+ getReshardingRquirementKind(shardingOp, *operandAndResultShardings);
+ if (reshardingRquirement == ReshardingRquirementKind::NO_RESHARDING) {
+ // This is the best case. No need to go on.
+ return *shardingOption;
+ }
+
+ shardingOptionsAndReshardingRequirements.emplace_back(
+ std::move(*shardingOption), reshardingRquirement);
+ }
+ }
+
+ if (shardingOptionsAndReshardingRequirements.empty()) {
+ return ShardingOption::makeEmpty();
+ }
+
+ std::partial_sort(
+ shardingOptionsAndReshardingRequirements.begin(),
+ shardingOptionsAndReshardingRequirements.begin() + 1,
+ shardingOptionsAndReshardingRequirements.end(),
+ [](const std::tuple<ShardingOption, ReshardingRquirementKind> &a,
+ const std::tuple<ShardingOption, ReshardingRquirementKind> &b) {
+ return std::get<ReshardingRquirementKind>(a) <
+ std::get<ReshardingRquirementKind>(b);
+ });
+
+ LLVM_DEBUG(DBGS() << "shardingOptionsAndReshardingRequirements = "
+ << shardingOptionsAndReshardingRequirements << "\n";);
+
+ return std::get<ShardingOption>(
+ shardingOptionsAndReshardingRequirements.front());
+}
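The selection step above only ever needs the single best candidate, so the partial_sort with a middle iterator of begin() + 1 simply moves the minimum-requirement element to the front. A small standalone sketch of that idiom (the candidate names are hypothetical; only the enum ordering matters):

#include <algorithm>
#include <cassert>
#include <string>
#include <tuple>
#include <vector>

enum class Kind { NoResharding = 0, NoReshardingForExplicit, ReshardingForExplicit };

int main() {
  std::vector<std::tuple<std::string, Kind>> candidates = {
      {"optA", Kind::ReshardingForExplicit},
      {"optB", Kind::NoReshardingForExplicit}};
  // Same idiom as the patch: move the least-resharding candidate to the front.
  std::partial_sort(candidates.begin(), candidates.begin() + 1,
                    candidates.end(), [](const auto &a, const auto &b) {
                      return std::get<Kind>(a) < std::get<Kind>(b);
                    });
  assert(std::get<std::string>(candidates.front()) == "optB");
  return 0;
}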
+
// For each operation that implements the ShardingInterface, infer the sharding
// option of the operation from its operands and/or results using the
// `getShardingOption` method. If the inferred sharding option is not empty, add
@@ -135,32 +338,21 @@ static LogicalResult visitOp(Operation *op, OpBuilder &builder) {
SmallVector<SmallVector<MeshShardingAttr>> possibleResultShardingAttrs =
getOrderedPossibleShardingAttrs(resultMustShardings,
allowConflictsResultShardings);
- FailureOr<ShardingOption> finalShardingOption = failure();
- for (ArrayRef<MeshShardingAttr> resultShardings :
- possibleResultShardingAttrs) {
- if (succeeded(finalShardingOption))
- break;
- for (ArrayRef<MeshShardingAttr> operandShardings :
- possibleOperandShardingAttrs) {
- FailureOr<ShardingOption> shardingOption =
- shardingOp.getShardingOption(operandShardings, resultShardings);
- if (succeeded(shardingOption)) {
- finalShardingOption = shardingOption;
- break;
- }
- }
- }
+ FailureOr<ShardingOption> shardingOption = selectShardingOption(
+ shardingOp, possibleOperandShardingAttrs, possibleResultShardingAttrs);
- if (failed(finalShardingOption)) {
+ if (failed(shardingOption)) {
op->emitOpError() << "fail to get sharding option.";
return failure();
}
+
+ LLVM_DEBUG(DBGS() << "Selected sharding option: " << *shardingOption << "\n");
+
// sharding info is empty, return immediately
- if (finalShardingOption->empty)
+ if (shardingOption->empty)
return success();
- if (failed(
- shardingOp.addShardingAnnotations(builder, *finalShardingOption))) {
+ if (failed(shardingOp.addShardingAnnotations(builder, *shardingOption))) {
op->emitOpError() << "fail to set sharding annotations.";
return failure();
}
@@ -199,6 +391,7 @@ struct ShardingPropagation
LLVM_DEBUG(DBGS() << "After reversed order propagation:\n"
<< funcOp << "\n");
+ LLVM_DEBUG(assert(succeeded(mlir::verify(funcOp))));
// 2. propagate in original order
for (Operation &op : llvm::make_early_inc_range(block))
diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
index 6b1326d76bc4..f3e4b15aec11 100644
--- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
+++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp
@@ -493,8 +493,6 @@ TypedValue<ShapedType> reshard(ImplicitLocOpBuilder &builder, MeshOp mesh,
TypedValue<ShapedType> reshard(OpBuilder &builder, MeshOp mesh, ShardOp source,
ShardOp target,
TypedValue<ShapedType> sourceShardValue) {
- assert(!source.getAnnotateForUsers());
- assert(target.getAnnotateForUsers());
assert(source.getResult() == target.getOperand());
ImplicitLocOpBuilder implicitLocOpBuilder(target->getLoc(), builder);
return reshard(
@@ -628,7 +626,6 @@ spmdizeOperation(ShardOp shardOp, IRMapping &spmdizationMap,
targetSpmdValue = spmdizationMap.lookup(shardOp.getOperand());
} else {
// Insert resharding.
- assert(!srcShardOp.getAnnotateForUsers() && shardOp.getAnnotateForUsers());
TypedValue<ShapedType> srcSpmdValue = cast<TypedValue<ShapedType>>(
spmdizationMap.lookup(srcShardOp.getOperand()));
targetSpmdValue = reshard(builder, srcShardOp, shardOp, srcSpmdValue,
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 24a6d5b5d684..110873011fe3 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1789,7 +1789,7 @@ LogicalResult DistributeOp::verify() {
}
//===----------------------------------------------------------------------===//
-// ReductionOp
+// DeclareReductionOp
//===----------------------------------------------------------------------===//
static ParseResult parseAtomicReductionRegion(OpAsmParser &parser,
@@ -1881,21 +1881,6 @@ LogicalResult DeclareReductionOp::verifyRegions() {
return success();
}
-LogicalResult ReductionOp::verify() {
- auto *op = (*this)->getParentWithTrait<ReductionClauseInterface::Trait>();
- if (!op)
- return emitOpError() << "must be used within an operation supporting "
- "reduction clause interface";
- while (op) {
- for (const auto &var :
- cast<ReductionClauseInterface>(op).getAllReductionVars())
- if (var == getAccumulator())
- return success();
- op = op->getParentWithTrait<ReductionClauseInterface::Trait>();
- }
- return emitOpError() << "the accumulator is not used by the parent";
-}
-
//===----------------------------------------------------------------------===//
// TaskOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
index 890ce5226c30..cc7d3172b1a1 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialAttributes.cpp
@@ -101,7 +101,7 @@ parseMonomial(AsmParser &parser, Monomial &monomial, llvm::StringRef &variable,
return success();
}
-template <typename PolynoimalAttrTy, typename Monomial>
+template <typename Monomial>
LogicalResult
parsePolynomialAttr(AsmParser &parser, llvm::SmallVector<Monomial> &monomials,
llvm::StringSet<> &variables,
@@ -155,7 +155,7 @@ Attribute IntPolynomialAttr::parse(AsmParser &parser, Type type) {
llvm::SmallVector<IntMonomial> monomials;
llvm::StringSet<> variables;
- if (failed(parsePolynomialAttr<IntPolynomialAttr, IntMonomial>(
+ if (failed(parsePolynomialAttr<IntMonomial>(
parser, monomials, variables,
[&](IntMonomial &monomial) -> OptionalParseResult {
APInt parsedCoeff(apintBitWidth, 1);
@@ -175,7 +175,6 @@ Attribute IntPolynomialAttr::parse(AsmParser &parser, Type type) {
}
return IntPolynomialAttr::get(parser.getContext(), result.value());
}
-
Attribute FloatPolynomialAttr::parse(AsmParser &parser, Type type) {
if (failed(parser.parseLess()))
return {};
@@ -191,8 +190,8 @@ Attribute FloatPolynomialAttr::parse(AsmParser &parser, Type type) {
return OptionalParseResult(result);
};
- if (failed(parsePolynomialAttr<FloatPolynomialAttr, FloatMonomial>(
- parser, monomials, variables, parseAndStoreCoefficient))) {
+ if (failed(parsePolynomialAttr<FloatMonomial>(parser, monomials, variables,
+ parseAndStoreCoefficient))) {
return {};
}
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td b/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td
index 9d09799c1763..e37bcf76a20f 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td
@@ -9,11 +9,14 @@
#ifndef POLYNOMIAL_CANONICALIZATION
#define POLYNOMIAL_CANONICALIZATION
-include "mlir/Dialect/Polynomial/IR/Polynomial.td"
include "mlir/Dialect/Arith/IR/ArithOps.td"
+include "mlir/Dialect/Polynomial/IR/Polynomial.td"
+include "mlir/IR/EnumAttr.td"
include "mlir/IR/OpBase.td"
include "mlir/IR/PatternBase.td"
+defvar DefOverflow = ConstantEnumCase<Arith_IntegerOverflowAttr, "none">;
+
// Get a -1 integer attribute of the same type as the polynomial SSA value's
// ring coefficient type.
def getMinusOne
@@ -39,4 +42,40 @@ def NTTAfterINTT : Pat<
[]
>;
+// NTTs are expensive, and addition in coefficient or NTT domain should be
+// equivalently expensive, so reducing the number of NTTs is optimal.
+// ntt(a) + ntt(b) -> ntt(a + b)
+def NTTOfAdd : Pat<
+ (Arith_AddIOp
+ (Polynomial_NTTOp $p1),
+ (Polynomial_NTTOp $p2),
+ $overflow),
+ (Polynomial_NTTOp (Polynomial_AddOp $p1, $p2)),
+ []
+>;
+// intt(a) + intt(b) -> intt(a + b)
+def INTTOfAdd : Pat<
+ (Polynomial_AddOp
+ (Polynomial_INTTOp $t1),
+ (Polynomial_INTTOp $t2)),
+ (Polynomial_INTTOp (Arith_AddIOp $t1, $t2, DefOverflow)),
+ []
+>;
+// repeated for sub
+def NTTOfSub : Pat<
+ (Arith_SubIOp
+ (Polynomial_NTTOp $p1),
+ (Polynomial_NTTOp $p2),
+ $overflow),
+ (Polynomial_NTTOp (Polynomial_SubOp $p1, $p2)),
+ []
+>;
+def INTTOfSub : Pat<
+ (Polynomial_SubOp
+ (Polynomial_INTTOp $t1),
+ (Polynomial_INTTOp $t2)),
+ (Polynomial_INTTOp (Arith_SubIOp $t1, $t2, DefOverflow)),
+ []
+>;
+
#endif // POLYNOMIAL_CANONICALIZATION
diff --git a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
index 1a2439fe810b..3d302797ce51 100644
--- a/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
+++ b/mlir/lib/Dialect/Polynomial/IR/PolynomialOps.cpp
@@ -186,6 +186,88 @@ LogicalResult INTTOp::verify() {
return verifyNTTOp(this->getOperation(), ring, tensorType);
}
+ParseResult ConstantOp::parse(OpAsmParser &parser, OperationState &result) {
+ // Using the built-in parser.parseAttribute requires the full
+ // #polynomial.typed_int_polynomial syntax, which is excessive.
+  // Instead, we parse the keyword `int` to signal an integer polynomial.
+ Type type;
+ if (succeeded(parser.parseOptionalKeyword("float"))) {
+ Attribute floatPolyAttr = FloatPolynomialAttr::parse(parser, nullptr);
+ if (floatPolyAttr) {
+ if (parser.parseColon() || parser.parseType(type))
+ return failure();
+ result.addAttribute("value",
+ TypedFloatPolynomialAttr::get(type, floatPolyAttr));
+ result.addTypes(type);
+ return success();
+ }
+ }
+
+ if (succeeded(parser.parseOptionalKeyword("int"))) {
+ Attribute intPolyAttr = IntPolynomialAttr::parse(parser, nullptr);
+ if (intPolyAttr) {
+ if (parser.parseColon() || parser.parseType(type))
+ return failure();
+
+ result.addAttribute("value",
+ TypedIntPolynomialAttr::get(type, intPolyAttr));
+ result.addTypes(type);
+ return success();
+ }
+ }
+
+ // In the worst case, still accept the verbose versions.
+ TypedIntPolynomialAttr typedIntPolyAttr;
+ OptionalParseResult res =
+ parser.parseOptionalAttribute<TypedIntPolynomialAttr>(
+ typedIntPolyAttr, "value", result.attributes);
+ if (res.has_value() && succeeded(res.value())) {
+ result.addTypes(typedIntPolyAttr.getType());
+ return success();
+ }
+
+ TypedFloatPolynomialAttr typedFloatPolyAttr;
+ res = parser.parseAttribute<TypedFloatPolynomialAttr>(
+ typedFloatPolyAttr, "value", result.attributes);
+ if (res.has_value() && succeeded(res.value())) {
+ result.addTypes(typedFloatPolyAttr.getType());
+ return success();
+ }
+
+ return failure();
+}
+
+void ConstantOp::print(OpAsmPrinter &p) {
+ p << " ";
+ if (auto intPoly = dyn_cast<TypedIntPolynomialAttr>(getValue())) {
+ p << "int";
+ intPoly.getValue().print(p);
+ } else if (auto floatPoly = dyn_cast<TypedFloatPolynomialAttr>(getValue())) {
+ p << "float";
+ floatPoly.getValue().print(p);
+ } else {
+ assert(false && "unexpected attribute type");
+ }
+ p << " : ";
+ p.printType(getOutput().getType());
+}
+
+LogicalResult ConstantOp::inferReturnTypes(
+ MLIRContext *context, std::optional<mlir::Location> location,
+ ConstantOp::Adaptor adaptor,
+ llvm::SmallVectorImpl<mlir::Type> &inferredReturnTypes) {
+ Attribute operand = adaptor.getValue();
+ if (auto intPoly = dyn_cast<TypedIntPolynomialAttr>(operand)) {
+ inferredReturnTypes.push_back(intPoly.getType());
+ } else if (auto floatPoly = dyn_cast<TypedFloatPolynomialAttr>(operand)) {
+ inferredReturnTypes.push_back(floatPoly.getType());
+ } else {
+ assert(false && "unexpected attribute type");
+ return failure();
+ }
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// TableGen'd canonicalization patterns
//===----------------------------------------------------------------------===//
@@ -201,10 +283,10 @@ void SubOp::getCanonicalizationPatterns(RewritePatternSet &results,
void NTTOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
- results.add<NTTAfterINTT>(context);
+ results.add<NTTAfterINTT, NTTOfAdd, NTTOfSub>(context);
}
void INTTOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
- results.add<INTTAfterNTT>(context);
+ results.add<INTTAfterNTT, INTTOfAdd, INTTOfSub>(context);
}
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 1a84a59ddb69..a72dafe72517 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -182,6 +182,9 @@ static LogicalResult generateLoopNestUsingForOp(
if (loops.empty())
return success();
+ assert(tiledResults.size() == destinationTensors.size() &&
+ "Number of results of body should be equal to number of iter args");
+
// 6. Yield all the results of the tiled operation.
SmallVector<Value> yieldedValues;
for (auto [tiledValue, destinationTensor, resultOffset, resultSize] :
@@ -694,9 +697,6 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
tileSizesVector.append(iterationDomain.size() - tileSizesVector.size(),
zero);
}
- if (op->getNumResults() != 1)
- return b.notifyMatchFailure(
- op, "don't support ops with multiple results for now");
SmallVector<utils::IteratorType> iterators =
tilingInterfaceOp.getLoopIteratorTypes();
@@ -708,12 +708,13 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
}
  // 2. Create the initial tensor value.
- FailureOr<Operation *> identityTensor =
+ FailureOr<SmallVector<Value>> maybeInitTensors =
op.generateInitialTensorForPartialReduction(b, loc, tileSizesVector,
reductionDims);
- if (failed(identityTensor))
- return b.notifyMatchFailure(op,
- "cannot create a tensor of identity value.");
+ if (failed(maybeInitTensors)) {
+ return b.notifyMatchFailure(op, "Failed to create initial tensors.");
+ }
+ SmallVector<Value> &initTensors = maybeInitTensors.value();
// 3. Define the callback to use for generating the inner most tile loop body.
Operation *parallelOp = nullptr;
@@ -753,29 +754,26 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
tiledResult.append(parallelOp->result_begin(), parallelOp->result_end());
// 4d. Compute the offsets and sizes needed to insert the result of the
// tiled value back into destination before yielding the destination.
- SmallVector<OpFoldResult> outOffsets(offsets.size(), b.getIndexAttr(0));
- resultOffsets.emplace_back(std::move(outOffsets));
-
- SmallVector<OpFoldResult> outSizes;
- for (size_t i = 0; i < offsets.size(); i++) {
- outSizes.push_back(
- tensor::getMixedSize(b, loc, parallelOp->getResult(0), i));
+ for (int resultIdx : llvm::seq<int>(0, parallelOp->getNumResults())) {
+ SmallVector<OpFoldResult> outOffsets(offsets.size(), b.getIndexAttr(0));
+ resultOffsets.emplace_back(std::move(outOffsets));
+
+ SmallVector<OpFoldResult> outSizes;
+ for (size_t i = 0; i < offsets.size(); i++) {
+ outSizes.push_back(
+ tensor::getMixedSize(b, loc, parallelOp->getResult(resultIdx), i));
+ }
+ resultSizes.emplace_back(std::move(outSizes));
}
- resultSizes.emplace_back(std::move(outSizes));
return success();
};
// 5. Generate the tiled implementation using the destination tensors.
- SmallVector<Value> destinationTensors =
- llvm::map_to_vector(identityTensor.value()->getResults(),
- [](OpResult res) -> Value { return res; });
-
SmallVector<LoopLikeOpInterface> loops;
scf::SCFTilingOptions options;
options.setLoopType(scf::SCFTilingOptions::LoopType::ForOp);
if (failed(generateLoopNest(b, loc, options, iterationDomain, tileSizesVector,
- destinationTensors, innerYieldTiledValuesFn,
- loops)))
+ initTensors, innerYieldTiledValuesFn, loops)))
return b.notifyMatchFailure(op, "failed to tile for parallel reduction");
SmallVector<Value> replacements = llvm::map_to_vector(
@@ -787,7 +785,7 @@ mlir::scf::tileReductionUsingScf(RewriterBase &b,
b.replaceOp(op, mergeOp->getResults());
SCFReductionTilingResult results;
- results.initialOp = *identityTensor;
+ results.initialValues = initTensors;
results.loops = loops;
results.parallelTiledOp = parallelOp;
results.mergeOp = mergeOp;
diff --git a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp
index 7a707e749e69..43ad0acaf742 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/EmptyOpPatterns.cpp
@@ -93,6 +93,49 @@ private:
bool foldSingleUseOnly = false;
};
+/// tensor.empty does not define any tensor contents, so an unpadded pack
+/// can be folded away.
+struct FoldEmptyTensorWithPackOp : public OpRewritePattern<PackOp> {
+ using OpRewritePattern<PackOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(PackOp packOp,
+ PatternRewriter &rewriter) const override {
+ // Check for tensor.empty source.
+ auto emptyOp = packOp.getSource().getDefiningOp<EmptyOp>();
+ if (!emptyOp)
+ return failure();
+
+ // Check for padding.
+ // Packing with padding cannot be simply removed.
+ if (packOp.getPaddingValue())
+ return rewriter.notifyMatchFailure(packOp, "expects no padding value");
+
+ // Replace the pack directly with its destination.
+ rewriter.replaceOp(packOp, packOp.getDest());
+
+ return success();
+ }
+};
+
+/// tensor.empty does not define any tensor contents, so an unpack
+/// can be folded away.
+struct FoldEmptyTensorWithUnPackOp : public OpRewritePattern<UnPackOp> {
+ using OpRewritePattern<UnPackOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(UnPackOp unPackOp,
+ PatternRewriter &rewriter) const override {
+ // Check for tensor.empty source.
+ auto emptyOp = unPackOp.getSource().getDefiningOp<EmptyOp>();
+ if (!emptyOp)
+ return failure();
+
+ // Replace the unpack directly with its destination.
+ rewriter.replaceOp(unPackOp, unPackOp.getDest());
+
+ return success();
+ }
+};
+
} // namespace
void mlir::tensor::populateFoldTensorEmptyPatterns(RewritePatternSet &patterns,
@@ -101,4 +144,6 @@ void mlir::tensor::populateFoldTensorEmptyPatterns(RewritePatternSet &patterns,
FoldEmptyTensorWithReshapeOp<tensor::ExpandShapeOp>,
FoldEmptyTensorWithReshapeOp<tensor::CollapseShapeOp>>(
patterns.getContext(), /*benefit=*/1, foldSingleUseOnly);
+ patterns.add<FoldEmptyTensorWithPackOp, FoldEmptyTensorWithUnPackOp>(
+ patterns.getContext(), /*benefit=*/1);
}
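A hedged sketch of how the new tensor.empty pack/unpack folds might be driven from client code; the header paths and the greedy-driver call are assumptions based on the usual MLIR pattern-application setup, not something this patch adds.

#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

// Collect the tensor.empty folding patterns (now including the pack/unpack
// folds registered above) and apply them greedily below `root`.
static void runFoldTensorEmpty(mlir::Operation *root) {
  mlir::RewritePatternSet patterns(root->getContext());
  mlir::tensor::populateFoldTensorEmptyPatterns(patterns,
                                                /*foldSingleUseOnly=*/false);
  if (mlir::failed(
          mlir::applyPatternsAndFoldGreedily(root, std::move(patterns))))
    root->emitWarning("tensor.empty folding did not converge");
}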
diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
index ebcb34e9ef02..5d6e3ec9756a 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp
@@ -91,7 +91,8 @@ struct SimplifyPackToExpandShape : public OpRewritePattern<PackOp> {
RankedTensorType sourceType = packOp.getSourceType();
if (failed(isPackOnInnerMostDim(rewriter, packOp)) &&
failed(isPackOn1D(rewriter, packOp, sourceType.getShape(),
- packOp.getStaticTiles()))) {
+ packOp.getStaticTiles())) &&
+ !packOp.isLikePad()) {
return failure();
}
@@ -152,7 +153,8 @@ struct SimplifyUnPackToCollapseShape : public OpRewritePattern<UnPackOp> {
RankedTensorType destType = unpackOp.getDestType();
if (failed(isUnpackOnInnerMostDim(rewriter, unpackOp)) &&
failed(isPackOn1D(rewriter, unpackOp, destType.getShape(),
- unpackOp.getStaticTiles()))) {
+ unpackOp.getStaticTiles())) &&
+ !unpackOp.isLikeUnPad()) {
return failure();
}
diff --git a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
index d40e5f33d2a7..6cf0f845f59d 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
@@ -79,12 +79,42 @@ struct FoldInsertOfRankReducingInsert : public OpRewritePattern<OpTy> {
return success();
}
};
+
+/// Fold expand_shape which only adds static dimensions of size `1`
+/// into insert_slice.
+template <typename OpTy>
+struct FoldPaddingExpandIntoInsert : public OpRewritePattern<OpTy> {
+ using OpRewritePattern<OpTy>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(OpTy insertSliceOp,
+ PatternRewriter &rewriter) const override {
+ auto expandShapeOp = insertSliceOp.getSource()
+ .template getDefiningOp<tensor::ExpandShapeOp>();
+ if (!expandShapeOp)
+ return failure();
+
+ // Only fold away simple expansion where all added dimensions have static
+ // size `1`.
+ SliceVerificationResult res = isRankReducedType(
+ expandShapeOp.getResultType(), expandShapeOp.getSrcType());
+ if (res != SliceVerificationResult::Success)
+ return rewriter.notifyMatchFailure(insertSliceOp,
+ "expected rank increasing expansion");
+
+ rewriter.modifyOpInPlace(insertSliceOp, [&]() {
+ insertSliceOp.getSourceMutable().assign(expandShapeOp.getSrc());
+ });
+ return success();
+ }
+};
} // namespace
void mlir::tensor::populateReassociativeReshapeFoldingPatterns(
RewritePatternSet &patterns) {
patterns.add<FoldExpandOfRankReducingExtract,
FoldInsertOfRankReducingInsert<tensor::InsertSliceOp>,
- FoldInsertOfRankReducingInsert<tensor::ParallelInsertSliceOp>>(
+ FoldInsertOfRankReducingInsert<tensor::ParallelInsertSliceOp>,
+ FoldPaddingExpandIntoInsert<tensor::InsertSliceOp>,
+ FoldPaddingExpandIntoInsert<tensor::ParallelInsertSliceOp>>(
patterns.getContext());
}
diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
index 6af229cae10a..fe1a67d62873 100644
--- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
+++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp
@@ -178,18 +178,24 @@ ConstantIntRanges mlir::intrange::truncRange(const ConstantIntRanges &range,
//===----------------------------------------------------------------------===//
ConstantIntRanges
-mlir::intrange::inferAdd(ArrayRef<ConstantIntRanges> argRanges) {
+mlir::intrange::inferAdd(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- ConstArithFn uadd = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+
+ std::function uadd = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.uadd_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nuw)
+ ? a.uadd_sat(b)
+ : a.uadd_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
- ConstArithFn sadd = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function sadd = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.sadd_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nsw)
+ ? a.sadd_sat(b)
+ : a.sadd_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
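As context for the nuw/nsw handling introduced here, a standalone illustration (not part of the patch): when the op carries nuw, wrapping is poison, so the saturated sum is still a valid bound and the inferred range need not widen to the full bit width. Compare the overflow-checked and saturating APInt adds:

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::APInt a(/*numBits=*/8, 200), b(/*numBits=*/8, 100);
  bool overflowed = false;
  // Overflow-checked add: wraps to 44 and reports the overflow.
  llvm::APInt checked = a.uadd_ov(b, overflowed);
  // Saturating add: clamps to 255, which is still a correct bound under nuw.
  llvm::APInt saturated = a.uadd_sat(b);
  llvm::errs() << "overflowed=" << (overflowed ? "true" : "false")
               << " checked=" << checked.getZExtValue()      // 44
               << " saturated=" << saturated.getZExtValue()  // 255
               << "\n";
  return 0;
}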
@@ -205,19 +211,24 @@ mlir::intrange::inferAdd(ArrayRef<ConstantIntRanges> argRanges) {
//===----------------------------------------------------------------------===//
ConstantIntRanges
-mlir::intrange::inferSub(ArrayRef<ConstantIntRanges> argRanges) {
+mlir::intrange::inferSub(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- ConstArithFn usub = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function usub = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.usub_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nuw)
+ ? a.usub_sat(b)
+ : a.usub_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
- ConstArithFn ssub = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function ssub = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.ssub_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nsw)
+ ? a.ssub_sat(b)
+ : a.ssub_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
ConstantIntRanges urange = computeBoundsBy(
@@ -232,19 +243,24 @@ mlir::intrange::inferSub(ArrayRef<ConstantIntRanges> argRanges) {
//===----------------------------------------------------------------------===//
ConstantIntRanges
-mlir::intrange::inferMul(ArrayRef<ConstantIntRanges> argRanges) {
+mlir::intrange::inferMul(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- ConstArithFn umul = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function umul = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.umul_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nuw)
+ ? a.umul_sat(b)
+ : a.umul_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
- ConstArithFn smul = [](const APInt &a,
- const APInt &b) -> std::optional<APInt> {
+ std::function smul = [=](const APInt &a,
+ const APInt &b) -> std::optional<APInt> {
bool overflowed = false;
- APInt result = a.smul_ov(b, overflowed);
+ APInt result = any(ovfFlags & OverflowFlags::Nsw)
+ ? a.smul_sat(b)
+ : a.smul_ov(b, overflowed);
return overflowed ? std::optional<APInt>() : result;
};
@@ -542,32 +558,35 @@ mlir::intrange::inferXor(ArrayRef<ConstantIntRanges> argRanges) {
//===----------------------------------------------------------------------===//
ConstantIntRanges
-mlir::intrange::inferShl(ArrayRef<ConstantIntRanges> argRanges) {
+mlir::intrange::inferShl(ArrayRef<ConstantIntRanges> argRanges,
+ OverflowFlags ovfFlags) {
const ConstantIntRanges &lhs = argRanges[0], &rhs = argRanges[1];
- const APInt &lhsSMin = lhs.smin(), &lhsSMax = lhs.smax(),
- &lhsUMax = lhs.umax(), &rhsUMin = rhs.umin(),
- &rhsUMax = rhs.umax();
+ const APInt &rhsUMin = rhs.umin(), &rhsUMax = rhs.umax();
- ConstArithFn shl = [](const APInt &l,
- const APInt &r) -> std::optional<APInt> {
- return r.uge(r.getBitWidth()) ? std::optional<APInt>() : l.shl(r);
+ // The signed/unsigned overflow behavior of shl by `rhs` matches a mul with
+ // 2^rhs.
+ std::function ushl = [=](const APInt &l,
+ const APInt &r) -> std::optional<APInt> {
+ bool overflowed = false;
+ APInt result = any(ovfFlags & OverflowFlags::Nuw)
+ ? l.ushl_sat(r)
+ : l.ushl_ov(r, overflowed);
+ return overflowed ? std::optional<APInt>() : result;
+ };
+ std::function sshl = [=](const APInt &l,
+ const APInt &r) -> std::optional<APInt> {
+ bool overflowed = false;
+ APInt result = any(ovfFlags & OverflowFlags::Nsw)
+ ? l.sshl_sat(r)
+ : l.sshl_ov(r, overflowed);
+ return overflowed ? std::optional<APInt>() : result;
};
-
- // The minMax inference does not work when there is danger of overflow. In the
- // signed case, this leads to the obvious problem that the sign bit might
- // change. In the unsigned case, it also leads to problems because the largest
- // LHS shifted by the largest RHS does not necessarily result in the largest
- // result anymore.
- assert(rhsUMax.isNonNegative() && "Unexpected negative shift count");
- if (rhsUMax.uge(lhsSMin.getNumSignBits()) ||
- rhsUMax.uge(lhsSMax.getNumSignBits()))
- return ConstantIntRanges::maxRange(lhsUMax.getBitWidth());
ConstantIntRanges urange =
- minMaxBy(shl, {lhs.umin(), lhsUMax}, {rhsUMin, rhsUMax},
+ minMaxBy(ushl, {lhs.umin(), lhs.umax()}, {rhsUMin, rhsUMax},
/*isSigned=*/false);
ConstantIntRanges srange =
- minMaxBy(shl, {lhsSMin, lhsSMax}, {rhsUMin, rhsUMax},
+ minMaxBy(sshl, {lhs.smin(), lhs.smax()}, {rhsUMin, rhsUMax},
/*isSigned=*/true);
return urange.intersection(srange);
}
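A standalone sketch (not from the patch) of the "shl behaves like multiplying by 2^rhs" point in the signed case: sshl_ov flags the sign-bit change, while sshl_sat clamps to the signed maximum, which is the bound the nsw path may use.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::APInt l(/*numBits=*/8, 96), amt(/*numBits=*/8, 1);
  bool overflowed = false;
  // 96 << 1 = 192, which flips the sign bit of an 8-bit value: overflow.
  llvm::APInt checked = l.sshl_ov(amt, overflowed);
  // Saturating shift clamps to INT8_MAX (127), a valid bound under nsw.
  llvm::APInt clamped = l.sshl_sat(amt);
  llvm::errs() << "overflowed=" << (overflowed ? "true" : "false")
               << " checked=" << checked.getSExtValue()  // -64 (wrapped)
               << " clamped=" << clamped.getSExtValue()  // 127
               << "\n";
  return 0;
}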
diff --git a/mlir/lib/Pass/IRPrinting.cpp b/mlir/lib/Pass/IRPrinting.cpp
index 72b94eeb0123..a12bdd935a48 100644
--- a/mlir/lib/Pass/IRPrinting.cpp
+++ b/mlir/lib/Pass/IRPrinting.cpp
@@ -9,8 +9,12 @@
#include "PassDetail.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Pass/PassManager.h"
-#include "llvm/Support/Format.h"
+#include "mlir/Support/FileUtilities.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/ToolOutputFile.h"
using namespace mlir;
using namespace mlir::detail;
@@ -200,6 +204,149 @@ struct BasicIRPrinterConfig : public PassManager::IRPrinterConfig {
};
} // namespace
+/// Return pairs of (sanitized op name, symbol name) for `op` and all parent
+/// operations. Op names are sanitized by replacing periods with underscores.
+/// The pairs are returned in order of outer-most to inner-most (ancestors of
+/// `op` first, `op` last). This information is used to construct the directory
+/// tree for the `FileTreeIRPrinterConfig` below.
+/// The counter for `op` will be incremented by this call.
+static std::pair<SmallVector<std::pair<std::string, StringRef>>, std::string>
+getOpAndSymbolNames(Operation *op, StringRef passName,
+ llvm::DenseMap<Operation *, unsigned> &counters) {
+ SmallVector<std::pair<std::string, StringRef>> pathElements;
+ SmallVector<unsigned> countPrefix;
+
+ if (!counters.contains(op))
+ counters[op] = -1;
+
+ Operation *iter = op;
+ ++counters[op];
+ while (iter) {
+ countPrefix.push_back(counters[iter]);
+ StringAttr symbolName =
+ iter->getAttrOfType<StringAttr>(SymbolTable::getSymbolAttrName());
+ std::string opName =
+ llvm::join(llvm::split(iter->getName().getStringRef().str(), '.'), "_");
+ pathElements.emplace_back(opName, symbolName ? symbolName.strref()
+ : "no-symbol-name");
+ iter = iter->getParentOp();
+ }
+ // Return in the order of top level (module) down to `op`.
+ std::reverse(countPrefix.begin(), countPrefix.end());
+ std::reverse(pathElements.begin(), pathElements.end());
+
+ std::string passFileName = llvm::formatv(
+ "{0:$[_]}_{1}.mlir",
+ llvm::make_range(countPrefix.begin(), countPrefix.end()), passName);
+
+ return {pathElements, passFileName};
+}
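The "{0:$[_]}_{1}.mlir" format string above relies on formatv's range adaptor, where $[_] sets the element separator. A standalone sketch of the resulting file name (the counter values and pass name are illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
#include <string>

int main() {
  llvm::SmallVector<unsigned> countPrefix = {0, 2, 1};
  // Joins the counters with '_' and appends the pass argument.
  std::string fileName = llvm::formatv(
      "{0:$[_]}_{1}.mlir",
      llvm::make_range(countPrefix.begin(), countPrefix.end()),
      llvm::StringRef("canonicalize"));
  llvm::errs() << fileName << "\n"; // 0_2_1_canonicalize.mlir
  return 0;
}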
+
+static LogicalResult createDirectoryOrPrintErr(llvm::StringRef dirPath) {
+ if (std::error_code ec =
+ llvm::sys::fs::create_directory(dirPath, /*IgnoreExisting=*/true)) {
+ llvm::errs() << "Error while creating directory " << dirPath << ": "
+ << ec.message() << "\n";
+ return failure();
+ }
+ return success();
+}
+
+/// Creates directories (if required) and opens an output file for the
+/// FileTreeIRPrinterConfig.
+static std::unique_ptr<llvm::ToolOutputFile>
+createTreePrinterOutputPath(Operation *op, llvm::StringRef passArgument,
+ llvm::StringRef rootDir,
+ llvm::DenseMap<Operation *, unsigned> &counters) {
+ // Create the path. We will create a tree rooted at the given 'rootDir'
+ // directory. The root directory will contain folders with the names of
+ // modules. Sub-directories within those folders mirror the nesting
+ // structure of the pass manager, using symbol names for directory names.
+ auto [opAndSymbolNames, fileName] =
+ getOpAndSymbolNames(op, passArgument, counters);
+
+ // Create all the directories, starting at the root. Abort early if we fail to
+ // create any directory.
+ llvm::SmallString<128> path(rootDir);
+ if (failed(createDirectoryOrPrintErr(path)))
+ return nullptr;
+
+ for (auto [opName, symbolName] : opAndSymbolNames) {
+ llvm::sys::path::append(path, opName + "_" + symbolName);
+ if (failed(createDirectoryOrPrintErr(path)))
+ return nullptr;
+ }
+
+ // Open output file.
+ llvm::sys::path::append(path, fileName);
+ std::string error;
+ std::unique_ptr<llvm::ToolOutputFile> file = openOutputFile(path, &error);
+ if (!file) {
+ llvm::errs() << "Error opening output file " << path << ": " << error
+ << "\n";
+ return nullptr;
+ }
+ return file;
+}
+
+namespace {
+/// A configuration that prints the IR before/after each pass to a set of files
+/// in the specified directory. The files are organized into subdirectories that
+/// mirror the nesting structure of the IR.
+struct FileTreeIRPrinterConfig : public PassManager::IRPrinterConfig {
+ FileTreeIRPrinterConfig(
+ std::function<bool(Pass *, Operation *)> shouldPrintBeforePass,
+ std::function<bool(Pass *, Operation *)> shouldPrintAfterPass,
+ bool printModuleScope, bool printAfterOnlyOnChange,
+ bool printAfterOnlyOnFailure, OpPrintingFlags opPrintingFlags,
+ llvm::StringRef treeDir)
+ : IRPrinterConfig(printModuleScope, printAfterOnlyOnChange,
+ printAfterOnlyOnFailure, opPrintingFlags),
+ shouldPrintBeforePass(std::move(shouldPrintBeforePass)),
+ shouldPrintAfterPass(std::move(shouldPrintAfterPass)),
+ treeDir(treeDir) {
+ assert((this->shouldPrintBeforePass || this->shouldPrintAfterPass) &&
+ "expected at least one valid filter function");
+ }
+
+ void printBeforeIfEnabled(Pass *pass, Operation *operation,
+ PrintCallbackFn printCallback) final {
+ if (!shouldPrintBeforePass || !shouldPrintBeforePass(pass, operation))
+ return;
+ std::unique_ptr<llvm::ToolOutputFile> file = createTreePrinterOutputPath(
+ operation, pass->getArgument(), treeDir, counters);
+ if (!file)
+ return;
+ printCallback(file->os());
+ file->keep();
+ }
+
+ void printAfterIfEnabled(Pass *pass, Operation *operation,
+ PrintCallbackFn printCallback) final {
+ if (!shouldPrintAfterPass || !shouldPrintAfterPass(pass, operation))
+ return;
+ std::unique_ptr<llvm::ToolOutputFile> file = createTreePrinterOutputPath(
+ operation, pass->getArgument(), treeDir, counters);
+ if (!file)
+ return;
+ printCallback(file->os());
+ file->keep();
+ }
+
+ /// Filter functions for before and after pass execution.
+ std::function<bool(Pass *, Operation *)> shouldPrintBeforePass;
+ std::function<bool(Pass *, Operation *)> shouldPrintAfterPass;
+
+ /// Directory that should be used as the root of the file tree.
+ std::string treeDir;
+
+ /// Counters used for labeling the prefix. Every op which could be targeted by
+ /// a pass gets its own counter.
+ llvm::DenseMap<Operation *, unsigned> counters;
+};
+
+} // namespace
+
/// Add an instrumentation to print the IR before and after pass execution,
/// using the provided configuration.
void PassManager::enableIRPrinting(std::unique_ptr<IRPrinterConfig> config) {
@@ -223,3 +370,16 @@ void PassManager::enableIRPrinting(
printModuleScope, printAfterOnlyOnChange, printAfterOnlyOnFailure,
opPrintingFlags, out));
}
+
+/// Add an instrumentation to print the IR before and after pass execution.
+void PassManager::enableIRPrintingToFileTree(
+ std::function<bool(Pass *, Operation *)> shouldPrintBeforePass,
+ std::function<bool(Pass *, Operation *)> shouldPrintAfterPass,
+ bool printModuleScope, bool printAfterOnlyOnChange,
+ bool printAfterOnlyOnFailure, StringRef printTreeDir,
+ OpPrintingFlags opPrintingFlags) {
+ enableIRPrinting(std::make_unique<FileTreeIRPrinterConfig>(
+ std::move(shouldPrintBeforePass), std::move(shouldPrintAfterPass),
+ printModuleScope, printAfterOnlyOnChange, printAfterOnlyOnFailure,
+ opPrintingFlags, printTreeDir));
+}
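A hedged sketch of calling the new PassManager::enableIRPrintingToFileTree hook directly; the predicates and output directory are illustrative, and the trailing OpPrintingFlags argument is passed explicitly in case the header declares a default for it.

#include "mlir/IR/OperationSupport.h"
#include "mlir/Pass/PassManager.h"

// Dump the IR after every pass into a directory tree under /tmp/mlir-ir-tree.
static void enableTreeDump(mlir::PassManager &pm) {
  pm.enableIRPrintingToFileTree(
      /*shouldPrintBeforePass=*/[](mlir::Pass *, mlir::Operation *) { return false; },
      /*shouldPrintAfterPass=*/[](mlir::Pass *, mlir::Operation *) { return true; },
      /*printModuleScope=*/false,
      /*printAfterOnlyOnChange=*/true,
      /*printAfterOnlyOnFailure=*/false,
      /*printTreeDir=*/"/tmp/mlir-ir-tree",
      mlir::OpPrintingFlags());
}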
diff --git a/mlir/lib/Pass/PassManagerOptions.cpp b/mlir/lib/Pass/PassManagerOptions.cpp
index ffc53b7e3ed0..706a21a23ee3 100644
--- a/mlir/lib/Pass/PassManagerOptions.cpp
+++ b/mlir/lib/Pass/PassManagerOptions.cpp
@@ -58,6 +58,10 @@ struct PassManagerOptions {
llvm::cl::desc("When printing IR for print-ir-[before|after]{-all} "
"always print the top-level operation"),
llvm::cl::init(false)};
+ llvm::cl::opt<std::string> printTreeDir{
+ "mlir-print-ir-tree-dir",
+      llvm::cl::desc("When printing the IR before/after a pass, print a file "
+                     "tree rooted at this directory")};
/// Add an IR printing instrumentation if enabled by any 'print-ir' flags.
void addPrinterInstrumentation(PassManager &pm);
@@ -120,6 +124,13 @@ void PassManagerOptions::addPrinterInstrumentation(PassManager &pm) {
return;
// Otherwise, add the IR printing instrumentation.
+ if (!printTreeDir.empty()) {
+ pm.enableIRPrintingToFileTree(shouldPrintBeforePass, shouldPrintAfterPass,
+ printModuleScope, printAfterChange,
+ printAfterFailure, printTreeDir);
+ return;
+ }
+
pm.enableIRPrinting(shouldPrintBeforePass, shouldPrintAfterPass,
printModuleScope, printAfterChange, printAfterFailure,
llvm::errs());
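For illustration only (the directory, pipeline, and symbol names here are hypothetical): invoking `mlir-opt input.mlir --pass-pipeline='builtin.module(func.func(canonicalize))' --mlir-print-ir-after-all --mlir-print-ir-tree-dir=/tmp/ir` would be expected to leave files such as /tmp/ir/builtin_module_no-symbol-name/func_func_main/0_0_canonicalize.mlir, with one directory per (sanitized op name, symbol name) pair along the nesting path and the counter prefix plus pass argument forming the file name.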
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index e0657c895e8a..5a3fa160850b 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -47,7 +47,7 @@ add_mlir_dialect_library(MLIRNVVMTarget
MLIRNVVMToLLVMIRTranslation
)
-if(MLIR_ENABLE_CUDA_CONVERSIONS)
+if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
# Find the CUDA toolkit.
find_package(CUDAToolkit)
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index e438ce84af1b..e75547ff9b85 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -13,7 +13,6 @@
#include "mlir/Target/LLVM/NVVM/Target.h"
-#include "mlir/Config/mlir-config.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Target/LLVM/NVVM/Utils.h"
@@ -158,40 +157,43 @@ SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
return std::move(bcFiles);
}
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
namespace {
class NVPTXSerializer : public SerializeGPUModuleBase {
public:
NVPTXSerializer(Operation &module, NVVMTargetAttr target,
const gpu::TargetOptions &targetOptions);
+ /// Returns the GPU module op being serialized.
gpu::GPUModuleOp getOperation();
- // Compile PTX to cubin using `ptxas`.
+ /// Compiles PTX to cubin using `ptxas`.
std::optional<SmallVector<char, 0>>
compileToBinary(const std::string &ptxCode);
- // Compile PTX to cubin using the `nvptxcompiler` library.
+ /// Compiles PTX to cubin using the `nvptxcompiler` library.
std::optional<SmallVector<char, 0>>
compileToBinaryNVPTX(const std::string &ptxCode);
+ /// Serializes the LLVM module to an object format, depending on the
+ /// compilation target selected in target options.
std::optional<SmallVector<char, 0>>
moduleToObject(llvm::Module &llvmModule) override;
private:
using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;
- // Create a temp file.
+ /// Creates a temp file.
std::optional<TmpFile> createTemp(StringRef name, StringRef suffix);
- // Find the `tool` path, where `tool` is the name of the binary to search,
- // i.e. `ptxas` or `fatbinary`. The search order is:
- // 1. The toolkit path in `targetOptions`.
- // 2. In the system PATH.
- // 3. The path from `getCUDAToolkitPath()`.
+ /// Finds the `tool` path, where `tool` is the name of the binary to search,
+ /// i.e. `ptxas` or `fatbinary`. The search order is:
+ /// 1. The toolkit path in `targetOptions`.
+ /// 2. In the system PATH.
+ /// 3. The path from `getCUDAToolkitPath()`.
std::optional<std::string> findTool(StringRef tool);
- // Target options.
+ /// Target options.
gpu::TargetOptions targetOptions;
};
} // namespace
@@ -515,7 +517,7 @@ NVPTXSerializer::compileToBinaryNVPTX(const std::string &ptxCode) {
std::optional<SmallVector<char, 0>>
NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
- // Return LLVM IR if the compilation target is offload.
+ // Return LLVM IR if the compilation target is `offload`.
#define DEBUG_TYPE "serialize-to-llvm"
LLVM_DEBUG({
llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
@@ -549,7 +551,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
});
#undef DEBUG_TYPE
- // Return PTX if the compilation target is assembly.
+ // Return PTX if the compilation target is `assembly`.
if (targetOptions.getCompilationTarget() ==
gpu::CompilationTarget::Assembly) {
// Make sure to include the null terminator.
@@ -564,7 +566,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
return compileToBinary(*serializedISA);
#endif // MLIR_ENABLE_NVPTXCOMPILER
}
-#endif // MLIR_ENABLE_CUDA_CONVERSIONS
+#endif // LLVM_HAS_NVPTX_TARGET
std::optional<SmallVector<char, 0>>
NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
@@ -576,7 +578,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
module->emitError("Module must be a GPU module.");
return std::nullopt;
}
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
NVPTXSerializer serializer(*module, cast<NVVMTargetAttr>(attribute), options);
serializer.init();
return serializer.run();
@@ -584,7 +586,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
module->emitError(
"The `NVPTX` target was not built. Please enable it when building LLVM.");
return std::nullopt;
-#endif // MLIR_ENABLE_CUDA_CONVERSIONS
+#endif // LLVM_HAS_NVPTX_TARGET
}
Attribute
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp
index eeda245ce969..d9cf85e4aeca 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/BuiltinOps.h"
@@ -19,7 +20,6 @@
#include "mlir/Support/LLVM.h"
#include "mlir/Target/LLVMIR/Dialect/OpenMPCommon.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
-#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 34b6903f8da0..6ec4c120c11e 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
#include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h"
@@ -333,54 +334,6 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder,
return success();
}
-/// Returns a reduction declaration that corresponds to the given reduction
-/// operation in the given container. Currently only supports reductions inside
-/// WsloopOp and ParallelOp but can be easily extended as long as the given
-/// construct implements getNumReductionVars.
-template <typename T>
-static std::optional<omp::DeclareReductionOp>
-findReductionDeclInContainer(T container, omp::ReductionOp reduction) {
- for (unsigned i = 0, e = container.getNumReductionVars(); i < e; ++i) {
- if (container.getReductionVars()[i] != reduction.getAccumulator())
- continue;
-
- SymbolRefAttr reductionSymbol =
- cast<SymbolRefAttr>((*container.getReductions())[i]);
- auto declareOp =
- SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>(
- container, reductionSymbol);
- return declareOp;
- }
- return std::nullopt;
-}
-
-/// Searches for a reduction in a provided region and the regions
-/// it is nested in
-static omp::DeclareReductionOp findReductionDecl(Operation &containerOp,
- omp::ReductionOp reduction) {
- std::optional<omp::DeclareReductionOp> declareOp = std::nullopt;
- Operation *container = &containerOp;
-
- while (!declareOp.has_value() && container) {
- // Check if current container is supported for reductions searches
- if (auto par = dyn_cast<omp::ParallelOp>(*container)) {
- declareOp = findReductionDeclInContainer(par, reduction);
- } else if (auto loop = dyn_cast<omp::WsloopOp>(*container)) {
- declareOp = findReductionDeclInContainer(loop, reduction);
- } else {
- break;
- }
-
- // See if we can search parent for reductions as well
- container = containerOp.getParentOp();
- }
-
- assert(declareOp.has_value() &&
- "reduction operation must be associated with a declaration");
-
- return *declareOp;
-}
-
/// Populates `reductions` with reduction declarations used in the given loop.
template <typename T>
static void
@@ -1785,62 +1738,6 @@ convertOmpAtomicCapture(omp::AtomicCaptureOp atomicCaptureOp,
return updateGenStatus;
}
-/// Converts an OpenMP reduction operation using OpenMPIRBuilder. Expects the
-/// mapping between reduction variables and their private equivalents to have
-/// been stored on the ModuleTranslation stack. Currently only supports
-/// reduction within WsloopOp and ParallelOp, but can be easily extended.
-static LogicalResult
-convertOmpReductionOp(omp::ReductionOp reductionOp,
- llvm::IRBuilderBase &builder,
- LLVM::ModuleTranslation &moduleTranslation) {
- // Find the declaration that corresponds to the reduction op.
- omp::DeclareReductionOp declaration;
- Operation *reductionParent = reductionOp->getParentOp();
- if (dyn_cast<omp::ParallelOp>(reductionParent) ||
- dyn_cast<omp::WsloopOp>(reductionParent)) {
- declaration = findReductionDecl(*reductionParent, reductionOp);
- } else {
- llvm_unreachable("Unhandled reduction container");
- }
- assert(declaration && "could not find reduction declaration");
-
- // Retrieve the mapping between reduction variables and their private
- // equivalents.
- const DenseMap<Value, llvm::Value *> *reductionVariableMap = nullptr;
- moduleTranslation.stackWalk<OpenMPVarMappingStackFrame>(
- [&](const OpenMPVarMappingStackFrame &frame) {
- if (frame.mapping.contains(reductionOp.getAccumulator())) {
- reductionVariableMap = &frame.mapping;
- return WalkResult::interrupt();
- }
- return WalkResult::advance();
- });
- assert(reductionVariableMap && "couldn't find private reduction variables");
- // Translate the reduction operation by emitting the body of the corresponding
- // reduction declaration.
- Region &reductionRegion = declaration.getReductionRegion();
- llvm::Value *privateReductionVar =
- reductionVariableMap->lookup(reductionOp.getAccumulator());
- llvm::Value *reductionVal = builder.CreateLoad(
- moduleTranslation.convertType(reductionOp.getOperand().getType()),
- privateReductionVar);
-
- moduleTranslation.mapValue(reductionRegion.front().getArgument(0),
- reductionVal);
- moduleTranslation.mapValue(
- reductionRegion.front().getArgument(1),
- moduleTranslation.lookupValue(reductionOp.getOperand()));
-
- SmallVector<llvm::Value *> phis;
- if (failed(inlineConvertOmpRegions(reductionRegion, "omp.reduction.body",
- builder, moduleTranslation, &phis)))
- return failure();
- assert(phis.size() == 1 && "expected one value to be yielded from "
- "the reduction body declaration region");
- builder.CreateStore(phis[0], privateReductionVar);
- return success();
-}
-
/// Converts an OpenMP Threadprivate operation into LLVM IR using
/// OpenMPIRBuilder.
static LogicalResult
@@ -3349,9 +3246,6 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
.Case([&](omp::ParallelOp op) {
return convertOmpParallel(op, builder, moduleTranslation);
})
- .Case([&](omp::ReductionOp reductionOp) {
- return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
- })
.Case([&](omp::MasterOp) {
return convertOmpMaster(*op, builder, moduleTranslation);
})
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index cf3257c8b9b8..1ec0736ec08b 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -16,6 +16,7 @@
#include "AttrKindDetail.h"
#include "DebugTranslation.h"
#include "LoopAnnotationTranslation.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMInterfaces.h"
@@ -33,7 +34,6 @@
#include "mlir/Support/LogicalResult.h"
#include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
#include "mlir/Target/LLVMIR/TypeToLLVM.h"
-#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index e2e240ad865c..a452cc3fae8a 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -9,6 +9,7 @@
#include "mlir/Transforms/Mem2Reg.h"
#include "mlir/Analysis/DataLayoutAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/PatternMatch.h"
@@ -16,7 +17,6 @@
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/MemorySlotInterfaces.h"
#include "mlir/Transforms/Passes.h"
-#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/GenericIteratedDominanceFrontier.h"
diff --git a/mlir/lib/Transforms/SROA.cpp b/mlir/lib/Transforms/SROA.cpp
index 67cbade07bc9..39f7256fb789 100644
--- a/mlir/lib/Transforms/SROA.cpp
+++ b/mlir/lib/Transforms/SROA.cpp
@@ -9,6 +9,7 @@
#include "mlir/Transforms/SROA.h"
#include "mlir/Analysis/DataLayoutAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Interfaces/MemorySlotInterfaces.h"
#include "mlir/Transforms/Passes.h"
diff --git a/mlir/lib/Transforms/TopologicalSort.cpp b/mlir/lib/Transforms/TopologicalSort.cpp
index 1219968fb369..528f6ef67602 100644
--- a/mlir/lib/Transforms/TopologicalSort.cpp
+++ b/mlir/lib/Transforms/TopologicalSort.cpp
@@ -8,8 +8,8 @@
#include "mlir/Transforms/Passes.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/RegionKindInterface.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
namespace mlir {
#define GEN_PASS_DEF_TOPOLOGICALSORT
diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt
index d6aac0e2da4f..b5788c679edc 100644
--- a/mlir/lib/Transforms/Utils/CMakeLists.txt
+++ b/mlir/lib/Transforms/Utils/CMakeLists.txt
@@ -10,7 +10,6 @@ add_mlir_library(MLIRTransformUtils
LoopInvariantCodeMotionUtils.cpp
OneToNTypeConversion.cpp
RegionUtils.cpp
- TopologicalSortUtils.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index 192f59b35329..b5e641d39fc0 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Transforms/RegionUtils.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Operation.h"
@@ -15,11 +16,9 @@
#include "mlir/IR/Value.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallSet.h"
#include <deque>
@@ -836,19 +835,3 @@ LogicalResult mlir::simplifyRegions(RewriterBase &rewriter,
return success(eliminatedBlocks || eliminatedOpsOrArgs ||
mergedIdenticalBlocks);
}
-
-SetVector<Block *> mlir::getBlocksSortedByDominance(Region &region) {
- // For each block that has not been visited yet (i.e. that has no
- // predecessors), add it to the list as well as its successors.
- SetVector<Block *> blocks;
- for (Block &b : region) {
- if (blocks.count(&b) == 0) {
- llvm::ReversePostOrderTraversal<Block *> traversal(&b);
- blocks.insert(traversal.begin(), traversal.end());
- }
- }
- assert(blocks.size() == region.getBlocks().size() &&
- "some blocks are not sorted");
-
- return blocks;
-}
diff --git a/mlir/lib/Transforms/ViewOpGraph.cpp b/mlir/lib/Transforms/ViewOpGraph.cpp
index c2eb2b893cea..b3c0a06c96fe 100644
--- a/mlir/lib/Transforms/ViewOpGraph.cpp
+++ b/mlir/lib/Transforms/ViewOpGraph.cpp
@@ -8,12 +8,12 @@
#include "mlir/Transforms/ViewOpGraph.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/IndentedOstream.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/GraphWriter.h"
#include <map>
diff --git a/mlir/python/mlir/dialects/linalg/__init__.py b/mlir/python/mlir/dialects/linalg/__init__.py
index 6e4cb1bd6267..8fb1227ee80f 100644
--- a/mlir/python/mlir/dialects/linalg/__init__.py
+++ b/mlir/python/mlir/dialects/linalg/__init__.py
@@ -55,7 +55,6 @@ from .._linalg_enum_gen import *
# TODO: guard against surprises and fail create Runtime Custom Ops with
# the same name as existing Core Named Ops.
from .opdsl.ops.core_named_ops import *
-from .opdsl.lang.emitter import isa
from ...ir import *
from .._ods_common import get_op_result_or_value as _get_op_result_or_value
@@ -71,7 +70,7 @@ def transpose(
if len(outs) > 1:
raise ValueError(f"{outs=} must have length 1.")
init = _get_op_result_or_value(outs[0])
- result_types = [init.type] if isa(RankedTensorType, init.type) else []
+ result_types = [init.type] if isinstance(init.type, RankedTensorType) else []
op = TransposeOp(
result=result_types,
@@ -93,7 +92,7 @@ def broadcast(
if len(outs) > 1:
raise ValueError(f"{outs=} must have length 1.")
init = _get_op_result_or_value(outs[0])
- result_types = [init.type] if isa(RankedTensorType, init.type) else []
+ result_types = [init.type] if isinstance(init.type, RankedTensorType) else []
op = BroadcastOp(
result=result_types,
diff --git a/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py b/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
index 845b533db52a..254458a97882 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/lang/emitter.py
@@ -31,14 +31,6 @@ __all__ = [
ValueList = Union[Sequence[Value], OpResultList]
-def isa(cls: Type, ty: Type):
- try:
- cls(ty)
- return True
- except ValueError:
- return False
-
-
def prepare_common_structured_op(
op_config: LinalgStructuredOpConfig,
*ins: Value,
@@ -127,7 +119,7 @@ def prepare_common_structured_op(
op_config, in_arg_defs, ins, out_arg_defs, outs
)
- result_types = [t for t in out_types if isa(RankedTensorType, t)]
+ result_types = [t for t in out_types if isinstance(t, RankedTensorType)]
# Initialize the type dictionary with the predefined types.
type_mapping = dict() # type: Dict[str, Type]
diff --git a/mlir/test/Analysis/DataFlow/test-next-access.mlir b/mlir/test/Analysis/DataFlow/test-next-access.mlir
index 8825c699dd13..700a23aa8bc4 100644
--- a/mlir/test/Analysis/DataFlow/test-next-access.mlir
+++ b/mlir/test/Analysis/DataFlow/test-next-access.mlir
@@ -63,7 +63,7 @@ func.func @branch(%arg0: memref<f32>, %arg1: f32, %arg2: i1) -> f32 {
return %phi : f32
}
-// CHECK-LABEL @dead_branch
+// CHECK-LABEL: @dead_branch
func.func @dead_branch(%arg0: memref<f32>, %arg1: f32) -> f32 {
// CHECK: name = "store"
// CHECK-SAME: next_access = ["unknown", ["load 2"]]
@@ -191,7 +191,7 @@ func.func @loop_cf(%arg0: memref<?xf32>, %arg1: f32, %arg2: index, %arg3: index,
return %8 : f32
}
-// CHECK-LABEL @conditional_cf
+// CHECK-LABEL: @conditional_cf
func.func @conditional_cf(%arg0: i1, %arg1: memref<f32>) {
// CHECK: name = "pre"
// CHECK-SAME: next_access = {{\[}}["then", "post"]]
diff --git a/mlir/test/Analysis/test-liveness.mlir b/mlir/test/Analysis/test-liveness.mlir
index 8ae3d09a6cd1..61a1e5fffa88 100644
--- a/mlir/test/Analysis/test-liveness.mlir
+++ b/mlir/test/Analysis/test-liveness.mlir
@@ -493,3 +493,27 @@ func.func @nested_region3(
}
return %1 : i32
}
+
+// -----
+
+// CHECK-LABEL: Testing : nested_region4
+
+func.func @nested_region4(%arg0: index, %arg1: index, %arg2: index) {
+ // CHECK: Block: 0
+ // CHECK-NEXT: LiveIn:{{ *$}}
+ // CHECK-NEXT: LiveOut:{{ *$}}
+
+ // CHECK: {{^// +}}[[VAL3:[a-z0-9_]+]]{{ *:}}
+ // CHECK: {{^// +}}[[VAL4:[a-z0-9_]+]]{{ *:}}
+ %c0_i32 = arith.constant 0 : i32
+ %c1_i32 = arith.constant 1 : i32
+
+ %0 = scf.for %arg3 = %arg0 to %arg1 step %arg2 iter_args(%arg4 = %c0_i32) -> (i32) {
+ // CHECK: Block: 1
+ // CHECK-NEXT: LiveIn: [[VAL4]]{{ *$}}
+ // CHECK-NEXT: LiveOut:{{ *$}}
+ %1 = arith.addi %arg4, %c1_i32 : i32
+ scf.yield %1 : i32
+ }
+ return
+}
diff --git a/mlir/test/Analysis/test-topoligical-sort.mlir b/mlir/test/Analysis/test-topoligical-sort.mlir
index 860858640205..150aff854fc8 100644
--- a/mlir/test/Analysis/test-topoligical-sort.mlir
+++ b/mlir/test/Analysis/test-topoligical-sort.mlir
@@ -1,21 +1,38 @@
-// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(test-print-topological-sort))" 2>&1 | FileCheck %s
+// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(test-print-topological-sort))" --split-input-file | FileCheck %s
-// CHECK-LABEL: Testing : region
-// CHECK: arith.addi {{.*}} : index
-// CHECK-NEXT: scf.for
-// CHECK: } {__test_sort_original_idx__ = 2 : i64}
-// CHECK-NEXT: arith.addi {{.*}} : i32
-// CHECK-NEXT: arith.subi {{.*}} : i32
-func.func @region(
- %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index,
- %arg4 : i32, %arg5 : i32, %arg6 : i32,
- %buffer : memref<i32>) {
- %0 = arith.addi %arg4, %arg5 {__test_sort_original_idx__ = 0} : i32
- %idx = arith.addi %arg0, %arg1 {__test_sort_original_idx__ = 3} : index
- scf.for %arg7 = %idx to %arg2 step %arg3 {
- %2 = arith.addi %0, %arg5 : i32
- %3 = arith.subi %2, %arg6 {__test_sort_original_idx__ = 1} : i32
- memref.store %3, %buffer[] : memref<i32>
- } {__test_sort_original_idx__ = 2}
+// CHECK-LABEL: single_element
+func.func @single_element() {
+ // CHECK: test_sort_index = 0
+ return {test_to_sort}
+}
+
+// -----
+
+// CHECK-LABEL: @simple_region
+func.func @simple_region(%cond: i1) {
+ // CHECK: test_sort_index = 0
+ %0 = arith.constant {test_to_sort} 42 : i32
+ scf.if %cond {
+ %1 = arith.addi %0, %0 : i32
+ // CHECK: test_sort_index = 2
+ %2 = arith.subi %0, %1 {test_to_sort} : i32
+ // CHECK: test_sort_index = 1
+ } {test_to_sort}
+ return
+}
+
+// -----
+
+// CHECK-LABEL: @multi_region
+func.func @multi_region(%cond: i1) {
+ scf.if %cond {
+ // CHECK: test_sort_index = 0
+ %0 = arith.constant {test_to_sort} 42 : i32
+ }
+
+ scf.if %cond {
+ // CHECK: test_sort_index = 1
+ %0 = arith.constant {test_to_sort} 24 : i32
+ }
return
}
diff --git a/mlir/test/Transforms/test-toposort.mlir b/mlir/test/Analysis/test-toposort.mlir
index c47b885dbec7..c47b885dbec7 100644
--- a/mlir/test/Transforms/test-toposort.mlir
+++ b/mlir/test/Analysis/test-toposort.mlir
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 8806a1dd9223..be0b26ee8aac 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -67,8 +67,8 @@ endif()
llvm_canonicalize_cmake_booleans(
LLVM_BUILD_EXAMPLES
+ LLVM_HAS_NVPTX_TARGET
MLIR_ENABLE_BINDINGS_PYTHON
- MLIR_ENABLE_CUDA_CONVERSIONS
MLIR_ENABLE_CUDA_RUNNER
MLIR_ENABLE_ROCM_CONVERSIONS
MLIR_ENABLE_ROCM_RUNNER
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
index 66dfa8fa3e15..97e4593f97b9 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc-unsupported.mlir
@@ -63,3 +63,10 @@ func.func @arith_cast_fptoui_i1(%arg0: f32) -> i1 {
return %t: i1
}
+// -----
+
+func.func @arith_extsi_i1_to_i32(%arg0: i1) {
+ // expected-error @+1 {{failed to legalize operation 'arith.extsi'}}
+ %idx = arith.extsi %arg0 : i1 to i32
+ return
+}
diff --git a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
index 79fecd61494d..b453b69a214e 100644
--- a/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
+++ b/mlir/test/Conversion/ArithToEmitC/arith-to-emitc.mlir
@@ -177,3 +177,66 @@ func.func @arith_int_to_float_cast_ops(%arg0: i8, %arg1: i64) {
return
}
+
+// -----
+
+func.func @arith_trunci(%arg0: i32) -> i8 {
+ // CHECK-LABEL: arith_trunci
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i32)
+ // CHECK: %[[CastUI:.*]] = emitc.cast %[[Arg0]] : i32 to ui32
+ // CHECK: %[[Trunc:.*]] = emitc.cast %[[CastUI]] : ui32 to ui8
+ // CHECK: emitc.cast %[[Trunc]] : ui8 to i8
+ %truncd = arith.trunci %arg0 : i32 to i8
+
+ return %truncd : i8
+}
+
+// -----
+
+func.func @arith_trunci_to_i1(%arg0: i32) -> i1 {
+ // CHECK-LABEL: arith_trunci_to_i1
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i32)
+ // CHECK: %[[Const:.*]] = "emitc.constant"
+ // CHECK-SAME: value = 1
+ // CHECK: %[[And:.*]] = emitc.bitwise_and %[[Arg0]], %[[Const]] : (i32, i32) -> i32
+ // CHECK: emitc.cast %[[And]] : i32 to i1
+ %truncd = arith.trunci %arg0 : i32 to i1
+
+ return %truncd : i1
+}
+
+// -----
+
+func.func @arith_extsi(%arg0: i32) {
+ // CHECK-LABEL: arith_extsi
+ // CHECK-SAME: ([[Arg0:[^ ]*]]: i32)
+ // CHECK: emitc.cast [[Arg0]] : i32 to i64
+ %extd = arith.extsi %arg0 : i32 to i64
+
+ return
+}
+
+// -----
+
+func.func @arith_extui(%arg0: i32) {
+ // CHECK-LABEL: arith_extui
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i32)
+ // CHECK: %[[Conv0:.*]] = emitc.cast %[[Arg0]] : i32 to ui32
+ // CHECK: %[[Conv1:.*]] = emitc.cast %[[Conv0]] : ui32 to ui64
+ // CHECK: emitc.cast %[[Conv1]] : ui64 to i64
+ %extd = arith.extui %arg0 : i32 to i64
+
+ return
+}
+
+// -----
+
+func.func @arith_extui_i1_to_i32(%arg0: i1) {
+ // CHECK-LABEL: arith_extui_i1_to_i32
+ // CHECK-SAME: (%[[Arg0:[^ ]*]]: i1)
+ // CHECK: %[[Conv0:.*]] = emitc.cast %[[Arg0]] : i1 to ui1
+ // CHECK: %[[Conv1:.*]] = emitc.cast %[[Conv0]] : ui1 to ui32
+ // CHECK: emitc.cast %[[Conv1]] : ui32 to i32
+ %idx = arith.extui %arg0 : i1 to i32
+ return
+}
diff --git a/mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir b/mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir
index 1eb387ce0e5b..f58a2afa1a89 100644
--- a/mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir
+++ b/mlir/test/Conversion/BufferizationToMemRef/bufferization-to-memref.mlir
@@ -79,7 +79,7 @@ func.func @conversion_dealloc_simple(%arg0: memref<2xf32>, %arg1: i1) {
return
}
-// CHECk: scf.if [[ARG1]] {
-// CHECk-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
-// CHECk-NEXT: }
-// CHECk-NEXT: return
+// CHECK: scf.if [[ARG1]] {
+// CHECK-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: return
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index dbf8ead49f78..1b046d32f163 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -778,11 +778,11 @@ func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : m
%crd0 = arith.constant 64 : index
%crd1 = arith.constant 128 : index
%devicePtr2d_unranked = memref.cast %devicePtr2d : memref<64x128xf32> to memref<*xf32>
- // CHECK : llvm.call @mgpuTensorMapEncodeTiledMemref
+ // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref
%tensorMap2d = nvgpu.tma.create.descriptor %devicePtr2d_unranked box[%crd0, %crd1] : memref<*xf32> -> !tensorMap2d
%devicePtr1d_unranked = memref.cast %devicePtr1d : memref<128xf32> to memref<*xf32>
- // CHECK : llvm.call @mgpuTensorMapEncodeTiledMemref
+ // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref
%tensorMap1d = nvgpu.tma.create.descriptor %devicePtr1d_unranked box[%crd1] : memref<*xf32> -> !tensorMap1d
func.return
}
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index 1d56ca97b737..21947c242461 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -17,7 +17,7 @@ llvm.func @init_mbarrier(%barrier_gen : !llvm.ptr, %barrier : !llvm.ptr<3>, %cou
llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32, %pred : i1) {
//CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r"
nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32
- //CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b "
+ //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$2 mbarrier.arrive.expect_tx.shared.b64 _, [$0], $1;", "r,r,b"
nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount, predicate = %pred : !llvm.ptr<3>, i32, i1
llvm.return
}
@@ -129,7 +129,7 @@ func.func @tma_load_5d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %bar
func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "l,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
@@ -138,7 +138,7 @@ func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "l,r,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
@@ -147,7 +147,7 @@ func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4}], [$5];", "l,r,r,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
@@ -156,7 +156,7 @@ func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5}], [$6];", "l,r,r,r,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
@@ -165,7 +165,7 @@ func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr
- // CHECK : llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6}], [$7];", "l,r,r,r,r,r,r,r,b"
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r,b"
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] predicate=%p : !llvm.ptr<3>, !llvm.ptr
return
}
diff --git a/mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir b/mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir
index 92afb765b5ab..ed6407a63239 100644
--- a/mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir
+++ b/mlir/test/Conversion/PDLToPDLInterp/pdl-to-pdl-interp-matcher.mlir
@@ -588,7 +588,7 @@ module @variadic_results_all {
// CHECK-DAG: %[[OPS:.*]] = pdl_interp.get_users of %[[VAL0]] : !pdl.value
// CHECK-DAG: pdl_interp.foreach %[[OP:.*]] : !pdl.operation in %[[OPS]]
// CHECK-DAG: %[[OPERANDS:.*]] = pdl_interp.get_operands of %[[OP]]
- // CHECK-DAG pdl_interp.are_equal %[[VALS]], %[[OPERANDS]] -> ^{{.*}}, ^[[CONTINUE:.*]]
+ // CHECK-DAG: pdl_interp.are_equal %[[OPERANDS]], %[[VALS]] : !pdl.range<value> -> ^{{.*}}, ^[[CONTINUE:.*]]
// CHECK-DAG: pdl_interp.is_not_null %[[OP]]
// CHECK-DAG: pdl_interp.check_result_count of %[[OP]] is 0
pdl.pattern @variadic_results_all : benefit(1) {
@@ -701,7 +701,7 @@ module @common_connector {
// CHECK-DAG: pdl_interp.are_equal %[[ROOTA_OP]], %[[VAL0]] : !pdl.value
// CHECK-DAG: %[[ROOTB_OP:.*]] = pdl_interp.get_operand 0 of %[[ROOTB]]
// CHECK-DAG: pdl_interp.are_equal %[[ROOTB_OP]], %[[VAL0]] : !pdl.value
- // CHECK-DAG } -> ^[[CONTA:.*]]
+ // CHECK-DAG: } -> ^[[CONTA:.*]]
pdl.pattern @common_connector : benefit(1) {
%type = type
%op = operation -> (%type, %type : !pdl.type, !pdl.type)
@@ -742,7 +742,7 @@ module @common_connector_range {
// CHECK-DAG: pdl_interp.are_equal %[[ROOTA_OPS]], %[[VALS0]] : !pdl.range<value>
// CHECK-DAG: %[[ROOTB_OPS:.*]] = pdl_interp.get_operands of %[[ROOTB]]
// CHECK-DAG: pdl_interp.are_equal %[[ROOTB_OPS]], %[[VALS0]] : !pdl.range<value>
- // CHECK-DAG } -> ^[[CONTA:.*]]
+ // CHECK-DAG: } -> ^[[CONTA:.*]]
pdl.pattern @common_connector_range : benefit(1) {
%types = types
%op = operation -> (%types, %types : !pdl.range<type>, !pdl.range<type>)
diff --git a/mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir b/mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir
index b9c56a3fcffd..980406d775d1 100644
--- a/mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir
+++ b/mlir/test/Conversion/SPIRVToLLVM/spirv-storage-class-mapping.mlir
@@ -91,5 +91,5 @@ spirv.func @pointerCodeSectionINTEL(!spirv.ptr<i1, CodeSectionINTEL>) "None"
spirv.func @pointerDeviceOnlyINTEL(!spirv.ptr<i1, DeviceOnlyINTEL>) "None"
// CHECK-OPENCL: llvm.func @pointerHostOnlyINTEL(!llvm.ptr<6>)
-// CHECK-UNKOWN: llvm.func @pointerHostOnlyINTEL(!llvm.ptr)
+// CHECK-UNKNOWN: llvm.func @pointerHostOnlyINTEL(!llvm.ptr)
spirv.func @pointerHostOnlyINTEL(!spirv.ptr<i1, HostOnlyINTEL>) "None"
diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
index cddc4ee38535..a7542086aa76 100644
--- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
+++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
@@ -483,6 +483,17 @@ func.func @shuffle(%v0 : vector<1xi32>, %v1: vector<1xi32>) -> vector<2xi32> {
// -----
+// CHECK-LABEL: func @interleave
+// CHECK-SAME: (%[[ARG0:.+]]: vector<2xf32>, %[[ARG1:.+]]: vector<2xf32>)
+// CHECK: %[[SHUFFLE:.*]] = spirv.VectorShuffle [0 : i32, 2 : i32, 1 : i32, 3 : i32] %[[ARG0]], %[[ARG1]] : vector<2xf32>, vector<2xf32> -> vector<4xf32>
+// CHECK: return %[[SHUFFLE]]
+func.func @interleave(%a: vector<2xf32>, %b: vector<2xf32>) -> vector<4xf32> {
+ %0 = vector.interleave %a, %b : vector<2xf32>
+ return %0 : vector<4xf32>
+}
+
+// -----
+
// CHECK-LABEL: func @reduction_add
// CHECK-SAME: (%[[V:.+]]: vector<4xi32>)
// CHECK: %[[S0:.+]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<4xi32>
diff --git a/mlir/test/Dialect/Affine/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir
index 74379978fdf8..0848a924b9d9 100644
--- a/mlir/test/Dialect/Affine/slicing-utils.mlir
+++ b/mlir/test/Dialect/Affine/slicing-utils.mlir
@@ -28,15 +28,15 @@ func.func @slicing_test() {
// BWD: matched: %[[v1:.*]] {{.*}} backward static slice:
//
// FWDBWD: matched: %[[v1:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%1 = "slicing-test-op" () : () -> i1
@@ -49,15 +49,15 @@ func.func @slicing_test() {
// BWD: matched: %[[v2:.*]] {{.*}} backward static slice:
//
// FWDBWD-NEXT: matched: %[[v2:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%2 = "slicing-test-op" () : () -> i2
@@ -69,15 +69,15 @@ func.func @slicing_test() {
// BWD: matched: %[[v3:.*]] {{.*}} backward static slice:
//
// FWDBWD-NEXT: matched: %[[v3:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-NEXT: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-NEXT: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%3 = "slicing-test-op" () : () -> i3
@@ -89,15 +89,15 @@ func.func @slicing_test() {
// BWD: matched: %[[v4:.*]] {{.*}} backward static slice:
//
// FWDBWD-NEXT: matched: %[[v4:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-NEXT: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-NEXT: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%4 = "slicing-test-op" () : () -> i4
@@ -111,15 +111,15 @@ func.func @slicing_test() {
// BWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
//
// FWDBWD-NEXT: matched: %[[v5:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%5 = "slicing-test-op" (%1, %2) : (i1, i2) -> i5
@@ -132,15 +132,15 @@ func.func @slicing_test() {
// BWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
//
// FWDBWD-NEXT: matched: %[[v6:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-NEXT: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-NEXT: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%6 = "slicing-test-op" (%3, %4) : (i3, i4) -> i6
@@ -153,15 +153,15 @@ func.func @slicing_test() {
// BWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
//
// FWDBWD-NEXT: matched: %[[v7:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
// FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%7 = "slicing-test-op" (%1, %5) : (i1, i5) -> i7
@@ -177,15 +177,15 @@ func.func @slicing_test() {
// BWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
//
// FWDBWD-NEXT: matched: %[[v8:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%8 = "slicing-test-op" (%5, %6) : (i5, i6) -> i8
@@ -202,15 +202,15 @@ func.func @slicing_test() {
// BWD-NEXT: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
//
// FWDBWD-NEXT: matched: %[[v9:.*]] {{.*}} static slice:
- // FWDBWD-DAG: %[[v4:.*]] = "slicing-test-op"() : () -> i4
- // FWDBWD-DAG: %[[v3:.*]] = "slicing-test-op"() : () -> i3
- // FWDBWD-NEXT: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
- // FWDBWD-DAG: %[[v2:.*]] = "slicing-test-op"() : () -> i2
- // FWDBWD-DAG: %[[v1:.*]] = "slicing-test-op"() : () -> i1
- // FWDBWD-NEXT: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
- // FWDBWD-DAG: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
- // FWDBWD-DAG: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
- // FWDBWD-NEXT: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
+ // FWDBWD: %[[v1:.*]] = "slicing-test-op"() : () -> i1
+ // FWDBWD: %[[v2:.*]] = "slicing-test-op"() : () -> i2
+ // FWDBWD: %[[v3:.*]] = "slicing-test-op"() : () -> i3
+ // FWDBWD: %[[v4:.*]] = "slicing-test-op"() : () -> i4
+ // FWDBWD: %[[v5:.*]] = "slicing-test-op"(%[[v1]], %[[v2]]) : (i1, i2) -> i5
+ // FWDBWD: %[[v6:.*]] = "slicing-test-op"(%[[v3]], %[[v4]]) : (i3, i4) -> i6
+ // FWDBWD: %[[v7:.*]] = "slicing-test-op"(%[[v1]], %[[v5]]) : (i1, i5) -> i7
+ // FWDBWD: %[[v8:.*]] = "slicing-test-op"(%[[v5]], %[[v6]]) : (i5, i6) -> i8
+ // FWDBWD: %[[v9:.*]] = "slicing-test-op"(%[[v7]], %[[v8]]) : (i7, i8) -> i9
%9 = "slicing-test-op" (%7, %8) : (i7, i8) -> i9
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index e4f95bb0545a..1a387c20c4b2 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -2950,6 +2950,14 @@ func.func @unsignedExtendConstantResource() -> tensor<i16> {
return %ext : tensor<i16>
}
+// Just checks that this doesn't crash.
+// CHECK-LABEL: @signedExtendSplatAsDynamicShape
+func.func @signedExtendSplatAsDynamicShape() -> tensor<?xi64> {
+ %splat = arith.constant dense<5> : tensor<2xi16>
+ %extsplat = arith.extsi %splat : tensor<2xi16> to tensor<?xi64>
+ return %extsplat : tensor<?xi64>
+}
+
// CHECK-LABEL: @extsi_i0
// CHECK: %[[ZERO:.*]] = arith.constant 0 : i16
// CHECK: return %[[ZERO]] : i16
diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir
index 16524b363472..5b538197a0c1 100644
--- a/mlir/test/Dialect/Arith/int-range-interface.mlir
+++ b/mlir/test/Dialect/Arith/int-range-interface.mlir
@@ -758,7 +758,7 @@ func.func private @callee(%arg0: memref<?xindex, 4>) {
}
// CHECK-LABEL: func @test_i8_bounds
-// CHECK: test.reflect_bounds {smax = 127 : i8, smin = -128 : i8, umax = -1 : i8, umin = 0 : i8}
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 255 : ui8, umin = 0 : ui8}
func.func @test_i8_bounds() -> i8 {
%cst1 = arith.constant 1 : i8
%0 = test.with_bounds { umin = 0 : i8, umax = 255 : i8, smin = -128 : i8, smax = 127 : i8 } : i8
@@ -766,3 +766,136 @@ func.func @test_i8_bounds() -> i8 {
%2 = test.reflect_bounds %1 : i8
return %2: i8
}
+
+// CHECK-LABEL: func @test_add_1
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 255 : ui8, umin = 0 : ui8}
+func.func @test_add_1() -> i8 {
+ %cst1 = arith.constant 1 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 255 : i8, smin = -128 : i8, smax = 127 : i8 } : i8
+ %1 = arith.addi %0, %cst1 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// Tests below check inference with overflow flags.
+
+// CHECK-LABEL: func @test_add_i8_wrap1
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 128 : ui8, umin = 1 : ui8}
+func.func @test_add_i8_wrap1() -> i8 {
+ %cst1 = arith.constant 1 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 127 : i8, smin = 0 : i8, smax = 127 : i8 } : i8
+ // smax overflow
+ %1 = arith.addi %0, %cst1 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_add_i8_wrap2
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 128 : ui8, umin = 1 : ui8}
+func.func @test_add_i8_wrap2() -> i8 {
+ %cst1 = arith.constant 1 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 127 : i8, smin = 0 : i8, smax = 127 : i8 } : i8
+ // smax overflow
+ %1 = arith.addi %0, %cst1 overflow<nuw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_add_i8_nowrap
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = 1 : si8, umax = 127 : ui8, umin = 1 : ui8}
+func.func @test_add_i8_nowrap() -> i8 {
+ %cst1 = arith.constant 1 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 127 : i8, smin = 0 : i8, smax = 127 : i8 } : i8
+ // nsw flag stops smax from overflowing
+ %1 = arith.addi %0, %cst1 overflow<nsw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_sub_i8_wrap1
+// CHECK: test.reflect_bounds {smax = 5 : si8, smin = -10 : si8, umax = 255 : ui8, umin = 0 : ui8} %1 : i8
+func.func @test_sub_i8_wrap1() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 15 : i8, smin = 0 : i8, smax = 15 : i8 } : i8
+ // umin underflows
+ %1 = arith.subi %0, %cst10 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_sub_i8_wrap2
+// CHECK: test.reflect_bounds {smax = 5 : si8, smin = -10 : si8, umax = 255 : ui8, umin = 0 : ui8} %1 : i8
+func.func @test_sub_i8_wrap2() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 15 : i8, smin = 0 : i8, smax = 15 : i8 } : i8
+ // umin underflows
+ %1 = arith.subi %0, %cst10 overflow<nsw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_sub_i8_nowrap
+// CHECK: test.reflect_bounds {smax = 5 : si8, smin = 0 : si8, umax = 5 : ui8, umin = 0 : ui8}
+func.func @test_sub_i8_nowrap() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 0 : i8, umax = 15 : i8, smin = 0 : i8, smax = 15 : i8 } : i8
+ // nuw flag stops umin from underflowing
+ %1 = arith.subi %0, %cst10 overflow<nuw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_mul_i8_wrap
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 200 : ui8, umin = 100 : ui8}
+func.func @test_mul_i8_wrap() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // smax overflows
+ %1 = arith.muli %0, %cst10 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_mul_i8_nowrap
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = 100 : si8, umax = 127 : ui8, umin = 100 : ui8}
+func.func @test_mul_i8_nowrap() -> i8 {
+ %cst10 = arith.constant 10 : i8
+ %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // nsw stops overflow
+ %1 = arith.muli %0, %cst10 overflow<nsw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_shl_i8_wrap1
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 160 : ui8, umin = 80 : ui8}
+func.func @test_shl_i8_wrap1() -> i8 {
+ %cst3 = arith.constant 3 : i8
+ %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // smax overflows
+ %1 = arith.shli %0, %cst3 : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_shl_i8_wrap2
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 160 : ui8, umin = 80 : ui8}
+func.func @test_shl_i8_wrap2() -> i8 {
+ %cst3 = arith.constant 3 : i8
+ %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // smax overflows
+ %1 = arith.shli %0, %cst3 overflow<nuw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
+
+// CHECK-LABEL: func @test_shl_i8_nowrap
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = 80 : si8, umax = 127 : ui8, umin = 80 : ui8}
+func.func @test_shl_i8_nowrap() -> i8 {
+ %cst3 = arith.constant 3 : i8
+  %0 = test.with_bounds { umin = 10 : i8, umax = 20 : i8, smin = 10 : i8, smax = 20 : i8 } : i8
+ // nsw stops smax overflow
+ %1 = arith.shli %0, %cst3 overflow<nsw> : i8
+ %2 = test.reflect_bounds %1 : i8
+ return %2: i8
+}
diff --git a/mlir/test/Dialect/Arith/int-range-opts.mlir b/mlir/test/Dialect/Arith/int-range-opts.mlir
index 6179003ab4e7..dd62a481a124 100644
--- a/mlir/test/Dialect/Arith/int-range-opts.mlir
+++ b/mlir/test/Dialect/Arith/int-range-opts.mlir
@@ -75,7 +75,7 @@ func.func @test() -> i1 {
// -----
// CHECK-LABEL: func @test
-// CHECK: test.reflect_bounds {smax = 24 : i8, smin = 0 : i8, umax = 24 : i8, umin = 0 : i8}
+// CHECK: test.reflect_bounds {smax = 24 : si8, smin = 0 : si8, umax = 24 : ui8, umin = 0 : ui8}
func.func @test() -> i8 {
%cst1 = arith.constant 1 : i8
%i8val = test.with_bounds { umin = 0 : i8, umax = 12 : i8, smin = 0 : i8, smax = 12 : i8 } : i8
@@ -87,7 +87,7 @@ func.func @test() -> i8 {
// -----
// CHECK-LABEL: func @test
-// CHECK: test.reflect_bounds {smax = 127 : i8, smin = -128 : i8, umax = -1 : i8, umin = 0 : i8}
+// CHECK: test.reflect_bounds {smax = 127 : si8, smin = -128 : si8, umax = 254 : ui8, umin = 0 : ui8}
func.func @test() -> i8 {
%cst1 = arith.constant 1 : i8
%i8val = test.with_bounds { umin = 0 : i8, umax = 127 : i8, smin = 0 : i8, smax = 127 : i8 } : i8
diff --git a/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir b/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir
index ce77d3d2f425..49bd74cfe912 100644
--- a/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir
+++ b/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir
@@ -1,6 +1,6 @@
// RUN: mlir-opt -arith-unsigned-when-equivalent %s | FileCheck %s
-// CHECK-LABEL func @not_with_maybe_overflow
+// CHECK-LABEL: func @not_with_maybe_overflow
// CHECK: arith.divsi
// CHECK: arith.ceildivsi
// CHECK: arith.floordivsi
@@ -32,7 +32,7 @@ func.func @not_with_maybe_overflow(%arg0 : i32) {
func.return
}
-// CHECK-LABEL func @yes_with_no_overflow
+// CHECK-LABEL: func @yes_with_no_overflow
// CHECK: arith.divui
// CHECK: arith.ceildivui
// CHECK: arith.divui
diff --git a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
index 88fc8a8923d3..fe4c005c7c42 100644
--- a/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
+++ b/mlir/test/Dialect/ArmSME/tile-allocation-liveness.mlir
@@ -366,15 +366,15 @@ func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<
// CHECK-LIVE-RANGE-LABEL: @cond_branch_with_backedge
// CHECK-LIVE-RANGE: ^bb1:
-// CHECK-LIVE-RANGE--NEXT: ||| | arith.cmpi
-// CHECK-LIVE-RANGE--NEXT: EEE E cf.cond_br
+// CHECK-LIVE-RANGE-NEXT: ||| | arith.cmpi
+// CHECK-LIVE-RANGE-NEXT: EEE E cf.cond_br
//
-// CHECK-LIVE-RANGE--NEXT: ^[[BB3_COPIES:[[:alnum:]]+]]:
-// CHECK-LIVE-RANGE--NEXT: ||| ES arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: E|| |S arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: E| ||S arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: E |||S arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: EEEE cf.br
+// CHECK-LIVE-RANGE-NEXT: ^[[BB3_COPIES:[[:alnum:]]+]]:
+// CHECK-LIVE-RANGE-NEXT: ||| ES arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: E|| |S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: E| ||S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: E |||S arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: EEEE cf.br
//
// It is important to note that the first three live ranges in ^bb1 do not end
// at the `cf.cond_br` they are live-out via the backedge bb1 -> bb2 -> bb1.
@@ -389,15 +389,15 @@ func.func @avoidable_spill(%a: vector<[4]xf32>, %b: vector<[4]xf32>, %c: vector<
//
// CHECK-LIVE-RANGE: ========== Coalesced Live Ranges:
// CHECK-LIVE-RANGE: ^bb1:
-// CHECK-LIVE-RANGE--NEXT: |||| arith.cmpi
-// CHECK-LIVE-RANGE--NEXT: EEEE cf.cond_br
+// CHECK-LIVE-RANGE-NEXT: |||| arith.cmpi
+// CHECK-LIVE-RANGE-NEXT: EEEE cf.cond_br
//
-// CHECK-LIVE-RANGE--NEXT: ^[[BB3_COPIES]]:
-// CHECK-LIVE-RANGE--NEXT: |||| arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: |||| arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: |||| arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: |||| arm_sme.copy_tile
-// CHECK-LIVE-RANGE--NEXT: EEEE cf.br
+// CHECK-LIVE-RANGE-NEXT: ^[[BB3_COPIES]]:
+// CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: |||| arm_sme.copy_tile
+// CHECK-LIVE-RANGE-NEXT: EEEE cf.br
// CHECK-LABEL: @cond_branch_with_backedge
// CHECK-NOT: tile_id = 16
diff --git a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir
index 03cf10aa0c05..3de3a6a693cf 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations-func.mlir
@@ -9,10 +9,10 @@ func.func @conversion_dealloc_simple(%arg0: memref<2xf32>, %arg1: i1) {
return
}
-// CHECk: scf.if [[ARG1]] {
-// CHECk-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
-// CHECk-NEXT: }
-// CHECk-NEXT: return
+// CHECK: scf.if [[ARG1]] {
+// CHECK-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: return
// -----
diff --git a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir
index 2c69fcab08a8..5fedd45555fc 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/lower-deallocations.mlir
@@ -29,10 +29,10 @@ func.func @conversion_dealloc_simple(%arg0: memref<2xf32>, %arg1: i1) {
return
}
-// CHECk: scf.if [[ARG1]] {
-// CHECk-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
-// CHECk-NEXT: }
-// CHECk-NEXT: return
+// CHECK: scf.if [[ARG1]] {
+// CHECK-NEXT: memref.dealloc [[ARG0]] : memref<2xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: return
// -----
diff --git a/mlir/test/Dialect/GPU/barrier-elimination.mlir b/mlir/test/Dialect/GPU/barrier-elimination.mlir
index 844dc7dd6ac0..1f5b84937deb 100644
--- a/mlir/test/Dialect/GPU/barrier-elimination.mlir
+++ b/mlir/test/Dialect/GPU/barrier-elimination.mlir
@@ -61,7 +61,7 @@ func.func @write_in_a_loop(%arg0: memref<?xf32>, %arg1: f32) attributes {__paral
return
}
-// CHECK-LABEL @read_read_write_loop
+// CHECK-LABEL: @read_read_write_loop
func.func @read_read_write_loop(%arg0: memref<?xf32>, %arg1: f32) attributes {__parallel_region_boundary_for_test} {
%c0 = arith.constant 0 : index
%c42 = arith.constant 42 : index
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index 511b01887747..ba7897f4e80c 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -227,7 +227,7 @@ module attributes {gpu.container_module} {
gpu.return
}
- // CHECK-LABEL gpu.func @printf_test
+ // CHECK-LABEL: gpu.func @printf_test
// CHECK: (%[[ARG0:.*]]: i32)
// CHECK: gpu.printf "Value: %d" %[[ARG0]] : i32
gpu.func @printf_test(%arg0 : i32) {
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 5e4724c9d309..47ebe326b5d1 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -123,7 +123,7 @@ llvm.func @launch_from_llvm_func() {
llvm.return
}
-// CHECK-DL-LABLE: gpu.module @launch_from_llvm_func_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+// CHECK-DL-LABEL: gpu.module @launch_from_llvm_func_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
// -----
diff --git a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
index 07e719798b85..732f40c4333d 100644
--- a/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
+++ b/mlir/test/Dialect/GPU/test-nvvm-pipeline.mlir
@@ -27,4 +27,4 @@ func.func @test_math(%arg0 : f32) {
gpu.terminator
}
return
-} \ No newline at end of file
+}
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index de2904d15b64..a7bdceba01c1 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -464,24 +464,24 @@ llvm.func private @mbarrier_test_wait_shared(%barrier: !llvm.ptr<3>, %token : i6
llvm.return
}
-// CHECK-LABEL : @wgmma_fence_aligned
+// CHECK-LABEL: @wgmma_fence_aligned
func.func @wgmma_fence_aligned() {
- // CHECK : nvvm.wgmma.fence.aligned
+ // CHECK: nvvm.wgmma.fence.aligned
nvvm.wgmma.fence.aligned
return
}
-// CHECK-LABEL : @wgmma_commit_group_sync_aligned
+// CHECK-LABEL: @wgmma_commit_group_sync_aligned
func.func @wgmma_commit_group_sync_aligned() {
- // CHECK : nvvm.wgmma.commit.group.sync.aligned
+ // CHECK: nvvm.wgmma.commit.group.sync.aligned
nvvm.wgmma.commit.group.sync.aligned
return
}
-// CHECK-LABEL : @wgmma_commit_group_sync_aligned
+// CHECK-LABEL: @wgmma_wait_group_sync_aligned
func.func @wgmma_wait_group_sync_aligned() {
- // CHECK : nvvm.wgmma.wait.group.sync.aligned
+ // CHECK: nvvm.wgmma.wait.group.sync.aligned
nvvm.wgmma.wait.group.sync.aligned 0
return
}
@@ -495,7 +495,7 @@ gpu.module @module_1 [#nvvm.target<chip = "sm_90", features = "+ptx70", link = [
gpu.module @module_2 [#nvvm.target<chip = "sm_90">, #nvvm.target<chip = "sm_80">, #nvvm.target<chip = "sm_70">] {
}
-// CHECK-LABEL : nvvm.grid_constant
+// CHECK-LABEL: nvvm.grid_constant
llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}) attributes {nvvm.kernel} {
llvm.return
}
diff --git a/mlir/test/Dialect/LLVMIR/type-consistency.mlir b/mlir/test/Dialect/LLVMIR/type-consistency.mlir
deleted file mode 100644
index c9c1355d16df..000000000000
--- a/mlir/test/Dialect/LLVMIR/type-consistency.mlir
+++ /dev/null
@@ -1,533 +0,0 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(llvm.func(llvm-type-consistency))" --split-input-file | FileCheck %s
-
-// CHECK-LABEL: llvm.func @same_address
-llvm.func @same_address(%arg: i32) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32)>
- %7 = llvm.getelementptr %1[8] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i32, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @same_address_keep_inbounds
-llvm.func @same_address_keep_inbounds(%arg: i32) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr inbounds %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32)>
- %7 = llvm.getelementptr inbounds %1[8] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i32, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @index_in_final_padding
-llvm.func @index_in_final_padding(%arg: i32) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i8)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i8)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][7] : (!llvm.ptr) -> !llvm.ptr, i8
- %7 = llvm.getelementptr %1[7] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i32, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @index_out_of_bounds
-llvm.func @index_out_of_bounds(%arg: i32) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][9] : (!llvm.ptr) -> !llvm.ptr, i8
- %7 = llvm.getelementptr %1[9] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i32, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @index_in_padding
-llvm.func @index_in_padding(%arg: i16) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][2] : (!llvm.ptr) -> !llvm.ptr, i8
- %7 = llvm.getelementptr %1[2] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i16, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @index_not_in_padding_because_packed
-llvm.func @index_not_in_padding_because_packed(%arg: i16) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", packed (i16, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i16, i32)> : (i32) -> !llvm.ptr
- // CHECK: = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", packed (i16, i32)>
- %7 = llvm.getelementptr %1[2] : (!llvm.ptr) -> !llvm.ptr, i8
- llvm.store %arg, %7 : i16, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @no_crash_on_negative_gep_index
-llvm.func @no_crash_on_negative_gep_index() {
- %0 = llvm.mlir.constant(1.000000e+00 : f16) : f16
- %1 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32)>
- %2 = llvm.alloca %1 x !llvm.struct<"foo", (i32, i32, i32)> : (i32) -> !llvm.ptr
- // CHECK: llvm.getelementptr %[[ALLOCA]][-1] : (!llvm.ptr) -> !llvm.ptr, f32
- %3 = llvm.getelementptr %2[-1] : (!llvm.ptr) -> !llvm.ptr, f32
- llvm.store %0, %3 : f16, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_ints
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_ints(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_ints_offset
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_ints_offset(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, i32, i32)> : (i32) -> !llvm.ptr
- %3 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32)>
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %3 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_floats
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_floats(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
- %0 = llvm.mlir.constant(1 : i32) : i32
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (f32, f32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (f32, f32)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (f32, f32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// Padding test purposefully not modified.
-
-// CHECK-LABEL: llvm.func @coalesced_store_padding_inbetween
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_padding_inbetween(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, i32)> : (i32) -> !llvm.ptr
- // CHECK: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// Padding test purposefully not modified.
-
-// CHECK-LABEL: llvm.func @coalesced_store_padding_end
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_padding_end(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i16)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i16)> : (i32) -> !llvm.ptr
- // CHECK: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_past_end
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_past_end(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32)> : (i32) -> !llvm.ptr
- // CHECK: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_packed_struct
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_packed_struct(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
- // CHECK-DAG: %[[CST48:.*]] = llvm.mlir.constant(48 : i64) : i64
-
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", packed (i16, i32, i16)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i16, i32, i16)> : (i32) -> !llvm.ptr
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i16
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST16]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", packed (i16, i32, i16)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST48]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i16
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", packed (i16, i32, i16)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @vector_write_split
-// CHECK-SAME: %[[ARG:.*]]: vector<4xi32>
-llvm.func @vector_write_split(%arg: vector<4xi32>) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
- // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
- // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
- // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32, i32)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST0]] : i32] : vector<4xi32>
- // CHECK: llvm.store %[[EXTRACT]], %[[ALLOCA]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST1]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST2]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST3]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- llvm.store %arg, %1 : vector<4xi32>, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @vector_write_split_offset
-// CHECK-SAME: %[[ARG:.*]]: vector<4xi32>
-llvm.func @vector_write_split_offset(%arg: vector<4xi32>) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
- // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
- // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
- // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, i32, i32, i32, i32)> : (i32) -> !llvm.ptr
- %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST0]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST1]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST2]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST3]] : i32] : vector<4xi32>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 4] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, i32, i32, i32, i32)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP]] : i32, !llvm.ptr
-
- llvm.store %arg, %2 : vector<4xi32>, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// Small test that a split vector store will be further optimized (to than e.g.
-// split integer loads to structs as shown here)
-
-// CHECK-LABEL: llvm.func @vector_write_split_struct
-// CHECK-SAME: %[[ARG:.*]]: vector<2xi64>
-llvm.func @vector_write_split_struct(%arg: vector<2xi64>) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, i32, i32)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, i32, i32)> : (i32) -> !llvm.ptr
-
- // CHECK-COUNT-4: llvm.store %{{.*}}, %{{.*}} : i32, !llvm.ptr
-
- llvm.store %arg, %1 : vector<2xi64>, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @gep_split
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @gep_split(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.array<2 x struct<"foo", (i64)>>
- %1 = llvm.alloca %0 x !llvm.array<2 x struct<"foo", (i64)>> : (i32) -> !llvm.ptr
- %3 = llvm.getelementptr %1[0, 1, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x struct<"foo", (i64)>>
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x struct<"foo", (i64)>>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64)>
- // CHECK: llvm.store %[[ARG]], %[[GEP]]
- llvm.store %arg, %3 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_ints_subaggregate
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_ints_subaggregate(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i64, struct<(i32, i32)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i64, struct<(i32, i32)>)> : (i32) -> !llvm.ptr
- %3 = llvm.getelementptr %1[0, 1, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, struct<(i32, i32)>)>
-
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64, struct<(i32, i32)>)>
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: llvm.store %[[TRUNC]], %[[TOP_GEP]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i32, i32)>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %3 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @gep_result_ptr_type_dynamic
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @gep_result_ptr_type_dynamic(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.array<2 x struct<"foo", (i64)>>
- %1 = llvm.alloca %0 x !llvm.array<2 x struct<"foo", (i64)>> : (i32) -> !llvm.ptr
- %3 = llvm.getelementptr %1[0, %arg, 0] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<2 x struct<"foo", (i64)>>
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, %[[ARG]]] : (!llvm.ptr, i64) -> !llvm.ptr, !llvm.array<2 x struct<"foo", (i64)>>
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i64)>
- // CHECK: llvm.store %[[ARG]], %[[GEP]]
- llvm.store %arg, %3 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @overlapping_int_aggregate_store
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @overlapping_int_aggregate_store(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i16
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST16]] : i64
- // CHECK: [[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i48
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
-
- // Normal integer splitting of [[TRUNC]] follows:
-
- // CHECK: llvm.store %{{.*}}, %[[TOP_GEP]]
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16)>
- // CHECK: llvm.store %{{.*}}, %[[GEP]]
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16)>
- // CHECK: llvm.store %{{.*}}, %[[GEP]]
-
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @overlapping_vector_aggregate_store
-// CHECK-SAME: %[[ARG:.*]]: vector<4xi16>
-llvm.func @overlapping_vector_aggregate_store(%arg: vector<4 x i16>) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i32) : i32
- // CHECK-DAG: %[[CST1:.*]] = llvm.mlir.constant(1 : i32) : i32
- // CHECK-DAG: %[[CST2:.*]] = llvm.mlir.constant(2 : i32) : i32
- // CHECK-DAG: %[[CST3:.*]] = llvm.mlir.constant(3 : i32) : i32
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST0]] : i32]
- // CHECK: llvm.store %[[EXTRACT]], %[[ALLOCA]]
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST1]] : i32]
- // CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP0]]
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST2]] : i32]
- // CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- // CHECK: %[[GEP1:.*]] = llvm.getelementptr %[[GEP0]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP1]]
-
- // CHECK: %[[EXTRACT:.*]] = llvm.extractelement %[[ARG]][%[[CST3]] : i32]
- // CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16)>)>
- // CHECK: %[[GEP1:.*]] = llvm.getelementptr %[[GEP0]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16)>
- // CHECK: llvm.store %[[EXTRACT]], %[[GEP1]]
-
- llvm.store %arg, %1 : vector<4 x i16>, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @partially_overlapping_aggregate_store
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @partially_overlapping_aggregate_store(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST16:.*]] = llvm.mlir.constant(16 : i64) : i64
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i16, struct<(i16, i16, i16, i16)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i16, struct<(i16, i16, i16, i16)>)> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i16
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST16]] : i64
- // CHECK: [[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i48
- // CHECK: %[[TOP_GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i16, struct<(i16, i16, i16, i16)>)>
-
- // Normal integer splitting of [[TRUNC]] follows:
-
- // CHECK: llvm.store %{{.*}}, %[[TOP_GEP]]
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16, i16)>
- // CHECK: llvm.store %{{.*}}, %[[GEP]]
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[TOP_GEP]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i16, i16, i16, i16)>
- // CHECK: llvm.store %{{.*}}, %[[GEP]]
-
- // It is important that there are no more stores at this point.
- // Specifically a store into the fourth field of %[[TOP_GEP]] would
- // incorrectly change the semantics of the code.
- // CHECK-NOT: llvm.store %{{.*}}, %{{.*}}
-
- llvm.store %arg, %1 : i64, !llvm.ptr
-
- llvm.return
-}
-
-// -----
-
-// Here a split is undesirable since the store does a partial store into the field.
-
-// CHECK-LABEL: llvm.func @undesirable_overlapping_aggregate_store
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @undesirable_overlapping_aggregate_store(%arg: i64) {
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.struct<"foo", (i32, i32, struct<(i64, i16, i16, i16)>)>
- %1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, i32, struct<(i64, i16, i16, i16)>)> : (i32) -> !llvm.ptr
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, struct<(i64, i16, i16, i16)>)>
- %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, i32, struct<(i64, i16, i16, i16)>)>
- // CHECK: llvm.store %[[ARG]], %[[GEP]]
- llvm.store %arg, %2 : i64, !llvm.ptr
-
- llvm.return
-}
-
-// -----
-
-// CHECK-LABEL: llvm.func @coalesced_store_ints_array
-// CHECK-SAME: %[[ARG:.*]]: i64
-llvm.func @coalesced_store_ints_array(%arg: i64) {
- // CHECK-DAG: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
- // CHECK-DAG: %[[CST32:.*]] = llvm.mlir.constant(32 : i64) : i64
-
- %0 = llvm.mlir.constant(1 : i32) : i32
- // CHECK: %[[ALLOCA:.*]] = llvm.alloca %{{.*}} x !llvm.array<2 x i32>
- %1 = llvm.alloca %0 x !llvm.array<2 x i32> : (i32) -> !llvm.ptr
-
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST0]]
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: llvm.store %[[TRUNC]], %[[ALLOCA]]
- // CHECK: %[[SHR:.*]] = llvm.lshr %[[ARG]], %[[CST32]] : i64
- // CHECK: %[[TRUNC:.*]] = llvm.trunc %[[SHR]] : i64 to i32
- // CHECK: %[[GEP:.*]] = llvm.getelementptr %[[ALLOCA]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x i32>
- // CHECK: llvm.store %[[TRUNC]], %[[GEP]]
- llvm.store %arg, %1 : i64, !llvm.ptr
- // CHECK-NOT: llvm.store %[[ARG]], %[[ALLOCA]]
- llvm.return
-}
diff --git a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
index cc9af913ca15..8a8260817769 100644
--- a/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
+++ b/mlir/test/Dialect/Linalg/block-pack-matmul.mlir
@@ -476,3 +476,32 @@ func.func @block_generic_matmul_transpose_b(
// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [32, 16]
// CHECK-SAME: into %[[C]] : tensor<2x4x32x16xf32> -> tensor<64x64xf32>
// CHECK: return %[[RES_UNPACKED]] : tensor<64x64xf32>
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+
+func.func @non_contraction_generic(
+ %A: tensor<64x128xf32>) -> tensor<64x128xf32> {
+ %c0 = arith.constant 0.000000e+00 : f32
+ %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel"]}
+ outs(%A : tensor<64x128xf32>) {
+ ^bb0(%out: f32):
+ %1 = arith.maximumf %out, %c0 : f32
+ linalg.yield %1 : f32
+ } -> tensor<64x128xf32>
+ return %0 : tensor<64x128xf32>
+}
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+
+// CHECK-LABEL: func @non_contraction_generic(
+// CHECK-SAME: %[[A:[0-9a-z]+]]: tensor<64x128xf32>
+// CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK-NOT: tensor.pack
+// CHECK: %[[GENERIC:.+]] = linalg.generic
+// CHECK-SAME: indexing_maps = [#[[$MAP]]]
+// CHECK-SAME: iterator_types = ["parallel", "parallel"]
+// CHECK-SAME: outs(%[[A]] : tensor<64x128xf32>)
+// CHECK-NOT: tensor.unpack
+// CHECK: return %[[GENERIC]] : tensor<64x128xf32>
diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
index bee08503298f..9140904620ac 100644
--- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
+++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir
@@ -795,7 +795,7 @@ func.func @reduction_pack_transpose_inner_dims(%arg0: tensor<128x256x32xi32>,
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]
// CHECK: %[[ARG1_EMPTY:.+]] = tensor.empty() : tensor<4x16x16x32xi32>
// CHECK: %[[PACK_ARG1:.+]] = tensor.pack %[[ARG1]]
-// CHECK-SME: inner_dims_pos = [1, 0] inner_tiles = [16, 32]
+// CHECK-SAME: inner_dims_pos = [1, 0] inner_tiles = [16, 32]
// CHECK-SAME: into %[[ARG1_EMPTY]]
// CHECK: %[[ARG0_EMPTY:.+]] = tensor.empty() : tensor<4x16x32x16x32xi32>
// CHECK: %[[PACK_ARG0:.+]] = tensor.pack %[[ARG0]]
diff --git a/mlir/test/Dialect/Linalg/mesh-sharding-propagation.mlir b/mlir/test/Dialect/Linalg/mesh-sharding-propagation.mlir
new file mode 100644
index 000000000000..59fd548dc2ef
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/mesh-sharding-propagation.mlir
@@ -0,0 +1,34 @@
+// RUN: mlir-opt \
+// RUN: --verify-each \
+// RUN: --pass-pipeline="builtin.module(func.func(sharding-propagation))" \
+// RUN: %s | FileCheck %s
+
+mesh.mesh @mesh_2(shape = 2)
+
+// CHECK-LABEL: func @matmul_shard_parallel_axis
+func.func @matmul_shard_parallel_axis(
+ // CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<2x3xf32>,
+ %arg0 : tensor<2x3xf32>,
+ // CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<3x2xf32>,
+ %arg1 : tensor<3x2xf32>,
+ // CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<2x2xf32>
+ %out_dps: tensor<2x2xf32>
+) -> tensor<2x2xf32> {
+ // CHECK: %[[IN1_ANNOTATED_0:.*]] = mesh.shard %[[IN1]] to <@mesh_2, {{\[}}[0]]> : tensor<2x3xf32>
+ // CHECK: %[[IN1_ANNOTATED_1:.*]] = mesh.shard %[[IN1_ANNOTATED_0]] to <@mesh_2, {{\[}}[0]]> annotate_for_users : tensor<2x3xf32>
+ // CHECK: %[[IN2_ANNOTATED:.*]] = mesh.shard %[[IN2]] to <@mesh_2, []> annotate_for_users : tensor<3x2xf32>
+ // CHECK: %[[DPS_OUT_ANNOTATED:.*]] = mesh.shard %[[DPS_OUT]] to <@mesh_2, {{\[}}[0]]> annotate_for_users : tensor<2x2xf32>
+ %arg0_sharded = mesh.shard %arg0 to <@mesh_2, [[0]]> : tensor<2x3xf32>
+
+ // CHECK: %[[RES:.*]] = linalg.matmul ins(%[[IN1_ANNOTATED_1]], %[[IN2_ANNOTATED]] : tensor<2x3xf32>, tensor<3x2xf32>)
+ // CHECK-SAME: outs(%[[DPS_OUT_ANNOTATED]] : tensor<2x2xf32>) -> tensor<2x2xf32>
+ %res = linalg.matmul ins(%arg0_sharded, %arg1 : tensor<2x3xf32>, tensor<3x2xf32>)
+ outs(%out_dps : tensor<2x2xf32>) -> tensor<2x2xf32>
+
+ // CHECK: %[[RES_ANNOTATED_0:.*]] = mesh.shard %[[RES]] to <@mesh_2, {{\[}}[0]]> : tensor<2x2xf32>
+ // CHECK: %[[RES_ANNOTATED_1:.*]] = mesh.shard %[[RES_ANNOTATED_0]] to <@mesh_2, {{\[}}[]]> annotate_for_users : tensor<2x2xf32>
+ %res_sharded = mesh.shard %res to <@mesh_2, [[]]> annotate_for_users : tensor<2x2xf32>
+
+ // CHECK: return %[[RES_ANNOTATED_1]] : tensor<2x2xf32>
+ return %res_sharded : tensor<2x2xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
index 0e1512717a22..f3cf7c4dffa0 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
@@ -80,13 +80,14 @@ module attributes {transform.with_named_sequence} {
// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (-d0 + s0, 5)>
// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d1)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d1, d0)>
+// CHECK-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d1)>
// CHECK: func @reduction_tile_transpose
// CHECK: tensor.empty(%{{.*}}) : tensor<5x?xf32>
// CHECK: linalg.fill {{.*}} : tensor<5x?xf32>) -> tensor<5x?xf32>
// CHECK: scf.for
// CHECK: %[[EXT:.*]] = tensor.extract_slice %[[ARG3:.*]][0, 0] [%[[D0:.*]], %[[D1:.*]]] [1, 1] : tensor<5x?xf32> to tensor<?x?xf32>
-// CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[MAP1]], #[[MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[L:.*]] : tensor<?x?xf32>) outs(%[[EXT]] : tensor<?x?xf32>)
+// CHECK: %[[R:.*]] = linalg.generic {indexing_maps = [#[[MAP1]], #[[MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[L:.*]] : tensor<?x?xf32>) outs(%[[EXT]] : tensor<?x?xf32>)
// CHECK: %[[INS:.*]] = tensor.insert_slice %[[R]] into %[[ARG3]][0, 0] [%[[D0]], %[[D1]]] [1, 1] : tensor<?x?xf32> into tensor<5x?xf32>
// CHECK: scf.yield {{.*}} : tensor<5x?xf32>
// CHECK: }
@@ -403,3 +404,48 @@ module {
// CHECK: scf.yield %[[L1]] : tensor<4096x2x64xf32>
// CHECK: %[[OUT2:.*]] = linalg.generic {indexing_maps = [{{.*}}, {{.*}}], iterator_types = ["parallel", "reduction", "reduction"]} ins(%{{.*}} : tensor<4096x2x64xf32>) outs(%{{.*}} : tensor<4096xf32>)
// CHECK: return %[[OUT2]] : tensor<4096xf32>
+
+// -----
+
+func.func @reduction_tile_multiple_results(%arg0: tensor<?x?xf32>, %out: tensor<?xf32>, %out2: tensor<?xf32>) -> (tensor<?xf32>, tensor<?xf32>) {
+ %red:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+ affine_map<(d0, d1) -> (d0)>,
+ affine_map<(d0, d1) -> (d0)>],
+ iterator_types = ["parallel", "reduction"]}
+ ins(%arg0 : tensor<?x?xf32>)
+ outs(%out, %out2 : tensor<?xf32>, tensor<?xf32>) {
+ ^bb0(%arg7: f32, %arg9: f32, %arg9_1: f32):
+ %1 = arith.mulf %arg7, %arg7 : f32
+ %2 = arith.addf %1, %arg9 : f32
+ %3 = arith.maximumf %1, %arg9_1 : f32
+ linalg.yield %2, %3 : f32, f32
+ } -> (tensor<?xf32>, tensor<?xf32>)
+ return %red#0, %red#1 : tensor<?xf32>, tensor<?xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %1, %12, %2, %3, %loop = transform.structured.tile_reduction_using_for %0
+ by tile_sizes = [0, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+ transform.yield
+ }
+}
+
+// CHECK: func @reduction_tile_multiple_results
+// CHECK-DAG: %[[SUM_ID:.+]] = arith.constant 0.000000e+00 : f32
+// CHECK-DAG: %[[MAX_ID:.+]] = arith.constant 0xFF800000 : f32
+// CHECK-DAG: %[[SUM_INIT:.+]] = linalg.fill ins(%[[SUM_ID]] : f32) outs(%{{.*}} : tensor<?x5xf32>) -> tensor<?x5xf32>
+// CHECK-DAG: %[[MAX_INIT:.+]] = linalg.fill ins(%[[MAX_ID]] : f32) outs(%{{.*}} : tensor<?x5xf32>) -> tensor<?x5xf32>
+// CHECK: %[[OUT:.+]]:2 = scf.for
+// CHECK-SAME: iter_args(%[[SUM:.+]] = %[[SUM_INIT]], %[[MAX:.+]] = %[[MAX_INIT]])
+// CHECK: %[[UPDATED:.*]]:2 = linalg.generic
+// CHECK: arith.mulf
+// CHECK: arith.addf
+// CHECK: arith.maximumf
+// CHECK: %[[INSERT1:.+]] = tensor.insert_slice %[[UPDATED]]#0 into %[[SUM]]
+// CHECK: %[[INSERT2:.+]] = tensor.insert_slice %[[UPDATED]]#1 into %[[MAX]]
+// CHECK: scf.yield %[[INSERT1]], %[[INSERT2]]
+// CHECK: linalg.generic
+// CHECK: arith.addf
+// CHECK: arith.maximumf
diff --git a/mlir/test/Dialect/Math/expand-math.mlir b/mlir/test/Dialect/Math/expand-math.mlir
index 016a7bbdeb56..c10a78ca4ae4 100644
--- a/mlir/test/Dialect/Math/expand-math.mlir
+++ b/mlir/test/Dialect/Math/expand-math.mlir
@@ -221,7 +221,7 @@ func.func @roundf_func(%a: f32) -> f32 {
// CHECK-LABEL: func @powf_func
// CHECK-SAME: ([[ARG0:%.+]]: f64, [[ARG1:%.+]]: f64)
func.func @powf_func(%a: f64, %b: f64) ->f64 {
- // CHECK-DAG = [[CST0:%.+]] = arith.constant 0.000000e+00
+ // CHECK-DAG: [[CST0:%.+]] = arith.constant 0.000000e+00
// CHECK-DAG: [[TWO:%.+]] = arith.constant 2.000000e+00
// CHECK-DAG: [[NEGONE:%.+]] = arith.constant -1.000000e+00
// CHECK-DAG: [[SQR:%.+]] = arith.mulf [[ARG0]], [[ARG0]]
diff --git a/mlir/test/Dialect/Mesh/sharding-propagation.mlir b/mlir/test/Dialect/Mesh/sharding-propagation.mlir
index 270787ab5188..11a80594adb7 100644
--- a/mlir/test/Dialect/Mesh/sharding-propagation.mlir
+++ b/mlir/test/Dialect/Mesh/sharding-propagation.mlir
@@ -1,5 +1,6 @@
-// RUN: mlir-opt --pass-pipeline="builtin.module(func.func(sharding-propagation))" %s | FileCheck %s
+// RUN: mlir-opt --pass-pipeline="builtin.module(func.func(sharding-propagation,cse))" %s | FileCheck %s
+mesh.mesh @mesh_2(shape = 2)
mesh.mesh @mesh_1d(shape = ?)
mesh.mesh @mesh_2d(shape = 2x4)
mesh.mesh @mesh_3d(shape = ?x?x?)
@@ -73,12 +74,11 @@ func.func @arrow_structure(%arg0: tensor<8x16xf32>) -> (tensor<8x16xf32>, tensor
// CHECK-NEXT: %[[V5:.*]] = tosa.abs %[[V4]]
// CHECK-NEXT: %[[V6:.*]] = mesh.shard %[[V5]] to <@mesh_2d, {{\[\[}}0], [1]]> : tensor<8x16xf32>
%1 = tosa.abs %0 : (tensor<8x16xf32>) -> tensor<8x16xf32>
- // CHECK-NEXT: %[[V7:.*]] = mesh.shard %[[V3]] to <@mesh_2d, {{\[\[}}0], [1]]> annotate_for_users : tensor<8x16xf32>
- // CHECK-NEXT: %[[V8:.*]] = tosa.negate %[[V7]]
- // CHECK-NEXT: %[[V9:.*]] = mesh.shard %[[V8]] to <@mesh_2d, {{\[\[}}0], [1]]> : tensor<8x16xf32>
+ // CHECK-NEXT: %[[V7:.*]] = tosa.negate %[[V4]]
+ // CHECK-NEXT: %[[V8:.*]] = mesh.shard %[[V7]] to <@mesh_2d, {{\[\[}}0], [1]]> : tensor<8x16xf32>
%2 = tosa.negate %0 : (tensor<8x16xf32>) -> tensor<8x16xf32>
%3 = mesh.shard %2 to <@mesh_2d, [[0], [1]]> : tensor<8x16xf32>
- // CHECK-NEXT: return %[[V6]], %[[V9]]
+ // CHECK-NEXT: return %[[V6]], %[[V8]]
return %1, %3 : tensor<8x16xf32>, tensor<8x16xf32>
}
@@ -135,6 +135,34 @@ func.func @matmul_on_use_shard_m_and_duplicted_k(%arg0: tensor<2x16x8xf32>, %arg
return %2 : tensor<2x16x32xf32>
}
+// CHECK-LABEL: func.func @resolve_conflicting_annotations
+func.func @resolve_conflicting_annotations(
+ // CHECK-SAME: %[[IN1:.*]]: tensor<2x3xf32>,
+ %arg0: tensor<2x3xf32>,
+ // CHECK-SAME: %[[IN2:.*]]: tensor<3x2xf32>,
+ %arg1: tensor<3x2xf32>,
+ // CHECK-SAME: %[[OUT_DPS:.*]]: tensor<2x2xf32>
+ %out_dps: tensor<2x2xf32>
+// CHECK-SAME: ) -> tensor<2x2xf32> {
+) -> tensor<2x2xf32> {
+ // CHECK: %[[IN1_SHARDED1:.*]] = mesh.shard %[[IN1]] to <@mesh_2, {{\[\[}}0]]> : tensor<2x3xf32>
+ // CHECK: %[[IN1_SHARDED2:.*]] = mesh.shard %[[IN1_SHARDED1]] to <@mesh_2, {{\[}}]> annotate_for_users : tensor<2x3xf32>
+ // CHECK: %[[IN2_SHARDED:.*]] = mesh.shard %[[IN2]] to <@mesh_2, []> annotate_for_users : tensor<3x2xf32>
+ // CHECK: %[[OUT_DPS_SHARDED:.*]] = mesh.shard %[[OUT_DPS]] to <@mesh_2, {{\[}}]> annotate_for_users : tensor<2x2xf32>
+ %arg0_sharded = mesh.shard %arg0 to <@mesh_2, [[0]]> : tensor<2x3xf32>
+
+ // CHECK: %[[MATMUL:.*]] = linalg.matmul ins(%[[IN1_SHARDED2]], %[[IN2_SHARDED]] : tensor<2x3xf32>, tensor<3x2xf32>)
+ // CHECK-SAME: outs(%[[OUT_DPS_SHARDED]] : tensor<2x2xf32>) -> tensor<2x2xf32>
+ %res = linalg.matmul ins(%arg0_sharded, %arg1 : tensor<2x3xf32>, tensor<3x2xf32>)
+ outs(%out_dps : tensor<2x2xf32>) -> tensor<2x2xf32>
+
+ // CHECK: %[[MATMUL_SHARDED1:.*]] = mesh.shard %[[MATMUL]] to <@mesh_2, {{\[\[}}]]> : tensor<2x2xf32>
+ %res_sharded = mesh.shard %res to <@mesh_2, [[]]> : tensor<2x2xf32>
+
+ // CHECK: return %[[MATMUL_SHARDED1]] : tensor<2x2xf32>
+ return %res_sharded : tensor<2x2xf32>
+}
+
// https://arxiv.org/abs/2211.05102 Figure 2(a)
// CHECK-LABEL: func.func @mlp_1d_weight_stationary
// CHECK-SAME: %[[ARG0:.*]]: tensor<2x4x8xf32>, %[[ARG1:.*]]: tensor<2x8x32xf32>, %[[ARG2:.*]]: tensor<2x32x8xf32>
diff --git a/mlir/test/Dialect/Mesh/spmdization.mlir b/mlir/test/Dialect/Mesh/spmdization.mlir
index 2df247aba351..d7a1e2fd9d27 100644
--- a/mlir/test/Dialect/Mesh/spmdization.mlir
+++ b/mlir/test/Dialect/Mesh/spmdization.mlir
@@ -16,6 +16,21 @@ func.func @full_replication(
return %1 : tensor<2xi8>
}
+// CHECK-LABEL: func @sharding_triplet
+func.func @sharding_triplet(
+ // CHECK-SAME: %[[ARG:.*]]: tensor<1xf32>
+ %arg0: tensor<2xf32>
+// CHECK-SAME: ) -> tensor<2xf32> {
+) -> tensor<2xf32> {
+ // CHECK: %[[ALL_GATHER:.*]] = mesh.all_gather %[[ARG]] on @mesh_1d mesh_axes = [0] gather_axis = 0 : tensor<1xf32> -> tensor<2xf32>
+ %sharding_annotated = mesh.shard %arg0 to <@mesh_1d, [[0]]> : tensor<2xf32>
+ %sharding_annotated_0 = mesh.shard %sharding_annotated to <@mesh_1d, [[0]]> annotate_for_users : tensor<2xf32>
+ %sharding_annotated_1 = mesh.shard %sharding_annotated_0 to <@mesh_1d, [[]]> : tensor<2xf32>
+ // CHECK: return %[[ALL_GATHER]] : tensor<2xf32>
+ return %sharding_annotated_1 : tensor<2xf32>
+}
+
+
// CHECK-LABEL: func @move_split_axis
func.func @move_split_axis(
// CHECK-SAME: %[[ARG:.*]]: tensor<1x2xi8>
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index db016fe8e7ba..115d164b6cc7 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -648,7 +648,6 @@ func.func @foo(%lb : index, %ub : index, %step : index) {
omp.wsloop reduction(@foo %0 -> %prv : !llvm.ptr) {
omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
%2 = arith.constant 2.0 : f32
- omp.reduction %2, %1 : f32, !llvm.ptr
omp.yield
}
omp.terminator
@@ -678,7 +677,6 @@ func.func @foo(%lb : index, %ub : index, %step : index) {
omp.wsloop reduction(@add_f32 %0 -> %prv : !llvm.ptr, @add_f32 %0 -> %prv1 : !llvm.ptr) {
omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
%2 = arith.constant 2.0 : f32
- omp.reduction %2, %0 : f32, !llvm.ptr
omp.yield
}
omp.terminator
@@ -713,7 +711,6 @@ func.func @foo(%lb : index, %ub : index, %step : index, %mem : memref<1xf32>) {
omp.wsloop reduction(@add_f32 %mem -> %prv : memref<1xf32>) {
omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
%2 = arith.constant 2.0 : f32
- omp.reduction %2, %mem : f32, memref<1xf32>
omp.yield
}
omp.terminator
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 0d5fd9383a92..caf25a3cb59f 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -1003,8 +1003,6 @@ func.func @omp_teams(%lb : i32, %ub : i32, %if_cond : i1, %num_threads : i32,
// CHECK: omp.teams reduction(@add_f32 -> %{{.+}} : !llvm.ptr) {
omp.teams reduction(@add_f32 -> %0 : !llvm.ptr) {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}}
- omp.reduction %1, %0 : f32, !llvm.ptr
// CHECK: omp.terminator
omp.terminator
}
@@ -1028,15 +1026,11 @@ func.func @sections_reduction() {
// CHECK: omp.section
omp.section {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}}
- omp.reduction %1, %0 : f32, !llvm.ptr
omp.terminator
}
// CHECK: omp.section
omp.section {
%1 = arith.constant 3.0 : f32
- // CHECK: omp.reduction %{{.+}}, %{{.+}}
- omp.reduction %1, %0 : f32, !llvm.ptr
omp.terminator
}
omp.terminator
@@ -1130,14 +1124,10 @@ func.func @sections_reduction2() {
omp.sections reduction(@add2_f32 -> %0 : memref<1xf32>) {
omp.section {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction
- omp.reduction %1, %0 : f32, memref<1xf32>
omp.terminator
}
omp.section {
%1 = arith.constant 2.0 : f32
- // CHECK: omp.reduction
- omp.reduction %1, %0 : f32, memref<1xf32>
omp.terminator
}
omp.terminator
diff --git a/mlir/test/Dialect/Polynomial/canonicalization.mlir b/mlir/test/Dialect/Polynomial/canonicalization.mlir
index dbfbf2d93f11..489d9ec2720d 100644
--- a/mlir/test/Dialect/Polynomial/canonicalization.mlir
+++ b/mlir/test/Dialect/Polynomial/canonicalization.mlir
@@ -43,3 +43,60 @@ func.func @test_canonicalize_sub(%poly0 : !sub_ty, %poly1 : !sub_ty) -> !sub_ty
// CHECK: [[ADD:%.+]] = polynomial.add %[[p0]], %[[p1neg]]
return %0 : !sub_ty
}
+
+// CHECK-LABEL: test_canonicalize_fold_add_through_ntt
+// CHECK: polynomial.add
+// CHECK-NOT: polynomial.ntt
+// CHECK-NOT: polynomial.intt
+func.func @test_canonicalize_fold_add_through_ntt(
+ %poly0 : !ntt_poly_ty,
+ %poly1 : !ntt_poly_ty) -> !ntt_poly_ty {
+ %0 = polynomial.ntt %poly0 : !ntt_poly_ty -> !tensor_ty
+ %1 = polynomial.ntt %poly1 : !ntt_poly_ty -> !tensor_ty
+ %a_plus_b = arith.addi %0, %1 : !tensor_ty
+ %out = polynomial.intt %a_plus_b : !tensor_ty -> !ntt_poly_ty
+ return %out : !ntt_poly_ty
+}
+
+// CHECK-LABEL: test_canonicalize_fold_add_through_intt
+// CHECK: arith.addi
+// CHECK-NOT: polynomial.intt
+// CHECK-NOT: polynomial.iintt
+func.func @test_canonicalize_fold_add_through_intt(
+ %tensor0 : !tensor_ty,
+ %tensor1 : !tensor_ty) -> !tensor_ty {
+ %0 = polynomial.intt %tensor0 : !tensor_ty -> !ntt_poly_ty
+ %1 = polynomial.intt %tensor1 : !tensor_ty -> !ntt_poly_ty
+ %a_plus_b = polynomial.add %0, %1 : !ntt_poly_ty
+ %out = polynomial.ntt %a_plus_b : !ntt_poly_ty -> !tensor_ty
+ return %out : !tensor_ty
+}
+
+// CHECK-LABEL: test_canonicalize_fold_sub_through_ntt
+// CHECK: polynomial.mul_scalar
+// CHECK: polynomial.add
+// CHECK-NOT: polynomial.ntt
+// CHECK-NOT: polynomial.intt
+func.func @test_canonicalize_fold_sub_through_ntt(
+ %poly0 : !ntt_poly_ty,
+ %poly1 : !ntt_poly_ty) -> !ntt_poly_ty {
+ %0 = polynomial.ntt %poly0 : !ntt_poly_ty -> !tensor_ty
+ %1 = polynomial.ntt %poly1 : !ntt_poly_ty -> !tensor_ty
+ %a_plus_b = arith.subi %0, %1 : !tensor_ty
+ %out = polynomial.intt %a_plus_b : !tensor_ty -> !ntt_poly_ty
+ return %out : !ntt_poly_ty
+}
+
+// CHECK-LABEL: test_canonicalize_fold_sub_through_intt
+// CHECK: arith.subi
+// CHECK-NOT: polynomial.intt
+// CHECK-NOT: polynomial.iintt
+func.func @test_canonicalize_fold_sub_through_intt(
+ %tensor0 : !tensor_ty,
+ %tensor1 : !tensor_ty) -> !tensor_ty {
+ %0 = polynomial.intt %tensor0 : !tensor_ty -> !ntt_poly_ty
+ %1 = polynomial.intt %tensor1 : !tensor_ty -> !ntt_poly_ty
+ %a_plus_b = polynomial.sub %0, %1 : !ntt_poly_ty
+ %out = polynomial.ntt %a_plus_b : !ntt_poly_ty -> !tensor_ty
+ return %out : !tensor_ty
+}
diff --git a/mlir/test/Dialect/Polynomial/ops.mlir b/mlir/test/Dialect/Polynomial/ops.mlir
index ff709960c50e..4716e37ff885 100644
--- a/mlir/test/Dialect/Polynomial/ops.mlir
+++ b/mlir/test/Dialect/Polynomial/ops.mlir
@@ -74,15 +74,19 @@ module {
func.func @test_monic_monomial_mul() {
%five = arith.constant 5 : index
- %0 = polynomial.constant {value=#one_plus_x_squared} : !polynomial.polynomial<ring=#ring1>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<ring=#ring1>
%1 = polynomial.monic_monomial_mul %0, %five : (!polynomial.polynomial<ring=#ring1>, index) -> !polynomial.polynomial<ring=#ring1>
return
}
func.func @test_constant() {
- %0 = polynomial.constant {value=#one_plus_x_squared} : !polynomial.polynomial<ring=#ring1>
- %1 = polynomial.constant {value=#polynomial.int_polynomial<1 + x**2>} : !polynomial.polynomial<ring=#ring1>
- %2 = polynomial.constant {value=#polynomial.float_polynomial<1.5 + 0.5 x**2>} : !polynomial.polynomial<ring=#ring2>
+ %0 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<ring=#ring1>
+ %1 = polynomial.constant int<1 + x**2> : !polynomial.polynomial<ring=#ring1>
+ %2 = polynomial.constant float<1.5 + 0.5 x**2> : !polynomial.polynomial<ring=#ring2>
+
+ // Test verbose fallbacks
+ %verb0 = polynomial.constant #polynomial.typed_int_polynomial<1 + x**2> : !polynomial.polynomial<ring=#ring1>
+ %verb2 = polynomial.constant #polynomial.typed_float_polynomial<1.5 + 0.5 x**2> : !polynomial.polynomial<ring=#ring2>
return
}
diff --git a/mlir/test/Dialect/SCF/transform-ops.mlir b/mlir/test/Dialect/SCF/transform-ops.mlir
index f4b0db7fb1f9..a4daa86583c3 100644
--- a/mlir/test/Dialect/SCF/transform-ops.mlir
+++ b/mlir/test/Dialect/SCF/transform-ops.mlir
@@ -6,11 +6,11 @@
// CHECK: scf.for
// CHECK: arith.addi
//
-// CHECK: func @foo[[SUFFIX:.+]](%{{.+}}, %{{.+}}, %{{.+}})
+// CHECK: func @foo[[$SUFFIX:.+]](%{{.+}}, %{{.+}}, %{{.+}})
// CHECK: scf.for
// CHECK: arith.addi
//
-// CHECK-LABEL @loop_outline_op
+// CHECK-LABEL: @loop_outline_op
func.func @loop_outline_op(%arg0: index, %arg1: index, %arg2: index) {
// CHECK: scf.for
// CHECK-NOT: scf.for
@@ -23,7 +23,7 @@ func.func @loop_outline_op(%arg0: index, %arg1: index, %arg2: index) {
}
// CHECK: scf.execute_region
// CHECK-NOT: scf.for
- // CHECK: func.call @foo[[SUFFIX]]
+ // CHECK: func.call @foo[[$SUFFIX]]
scf.for %j = %arg0 to %arg1 step %arg2 {
arith.addi %j, %j : index
}
diff --git a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
index 7dc0bd99f54b..5c24f0e6a7d3 100644
--- a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir
@@ -180,7 +180,7 @@ func.func @logicalUnary(%arg0 : i32)
func.func @select_op_bool(%arg0: i1) -> () {
%0 = spirv.Constant true
%1 = spirv.Constant false
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, i1
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, i1
%2 = spirv.Select %arg0, %0, %1 : i1, i1
return
}
@@ -188,7 +188,7 @@ func.func @select_op_bool(%arg0: i1) -> () {
func.func @select_op_int(%arg0: i1) -> () {
%0 = spirv.Constant 2 : i32
%1 = spirv.Constant 3 : i32
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, i32
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, i32
%2 = spirv.Select %arg0, %0, %1 : i1, i32
return
}
@@ -196,7 +196,7 @@ func.func @select_op_int(%arg0: i1) -> () {
func.func @select_op_float(%arg0: i1) -> () {
%0 = spirv.Constant 2.0 : f32
%1 = spirv.Constant 3.0 : f32
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, f32
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, f32
%2 = spirv.Select %arg0, %0, %1 : i1, f32
return
}
@@ -204,7 +204,7 @@ func.func @select_op_float(%arg0: i1) -> () {
func.func @select_op_ptr(%arg0: i1) -> () {
%0 = spirv.Variable : !spirv.ptr<f32, Function>
%1 = spirv.Variable : !spirv.ptr<f32, Function>
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, !spirv.ptr<f32, Function>
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, !spirv.ptr<f32, Function>
%2 = spirv.Select %arg0, %0, %1 : i1, !spirv.ptr<f32, Function>
return
}
@@ -212,7 +212,7 @@ func.func @select_op_ptr(%arg0: i1) -> () {
func.func @select_op_vec(%arg0: i1) -> () {
%0 = spirv.Constant dense<[2.0, 3.0, 4.0]> : vector<3xf32>
%1 = spirv.Constant dense<[5.0, 6.0, 7.0]> : vector<3xf32>
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, vector<3xf32>
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : i1, vector<3xf32>
%2 = spirv.Select %arg0, %0, %1 : i1, vector<3xf32>
return
}
@@ -220,7 +220,7 @@ func.func @select_op_vec(%arg0: i1) -> () {
func.func @select_op_vec_condn_vec(%arg0: vector<3xi1>) -> () {
%0 = spirv.Constant dense<[2.0, 3.0, 4.0]> : vector<3xf32>
%1 = spirv.Constant dense<[5.0, 6.0, 7.0]> : vector<3xf32>
- // CHECK : spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : vector<3xi1>, vector<3xf32>
+ // CHECK: spirv.Select {{%.*}}, {{%.*}}, {{%.*}} : vector<3xi1>, vector<3xf32>
%2 = spirv.Select %arg0, %0, %1 : vector<3xi1>, vector<3xf32>
return
}
diff --git a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
index db0f52dcc40e..1eed5892a085 100644
--- a/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/structure-ops.mlir
@@ -330,7 +330,7 @@ spirv.module Logical GLSL450 {
// TODO: Fix test case after initialization with normal constant is addressed
// spirv.module Logical GLSL450 {
// %0 = spirv.Constant 4.0 : f32
-// // CHECK1: spirv.Variable init(%0) : !spirv.ptr<f32, Private>
+// COM: CHECK: spirv.Variable init(%0) : !spirv.ptr<f32, Private>
// spirv.GlobalVariable @var1 init(%0) : !spirv.ptr<f32, Private>
// }
@@ -372,7 +372,7 @@ spirv.module Logical GLSL450 {
// TODO: Fix test case after initialization with constant is addressed
// spirv.module Logical GLSL450 {
// %0 = spirv.Constant 4.0 : f32
-// // CHECK1: spirv.GlobalVariable @var1 initializer(%0) {binding = 5 : i32} : !spirv.ptr<f32, Private>
+// COM: CHECK: spirv.GlobalVariable @var1 initializer(%0) {binding = 5 : i32} : !spirv.ptr<f32, Private>
// spirv.GlobalVariable @var1 initializer(%0) {binding = 5 : i32} : !spirv.ptr<f32, Private>
// }
diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 914e5e8b8c4b..f7fbd3834288 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -2523,4 +2523,3 @@ func.func @dim_out_of_bounds() -> vector<7xi32> {
%16 = affine.vector_load %alloc_21[%c1, %c1, %dim] : memref<?x26x2xi32>, vector<7xi32>
return %16 : vector<7xi32>
}
-
diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir b/mlir/test/Dialect/Tensor/fold-empty-op.mlir
index e200a4f89261..e94f6ec7ec56 100644
--- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir
+++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir
@@ -64,6 +64,79 @@ func.func @rank_reducing_empty_tensor_extract(%sz : index, %idx : index) -> tens
return %r: tensor<2xf32>
}
+func.func @pack_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> {
+ %empty_unpacked = tensor.empty() : tensor<256x256xf32>
+ %packed = tensor.pack %empty_unpacked
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32>
+ return %packed : tensor<8x8x32x32xf32>
+}
+
+// CHECK-LABEL: func.func @pack_empty(
+// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32>
+// CHECK-NOT: tensor.pack
+// CHECK: return %[[T]] : tensor<8x8x32x32xf32>
+
+func.func @pack_empty_dynamic(%arg0: tensor<?x?x?x?xf32>, %dim0: index, %dim1: index) -> tensor<?x?x?x?xf32> {
+ %empty_unpacked = tensor.empty(%dim0, %dim1) : tensor<?x?xf32>
+ %packed = tensor.pack %empty_unpacked
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<?x?xf32> -> tensor<?x?x?x?xf32>
+ return %packed : tensor<?x?x?x?xf32>
+}
+
+// CHECK-LABEL: func.func @pack_empty_dynamic(
+// CHECK-SAME: %[[T:.+]]: tensor<?x?x?x?xf32>,
+// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index,
+// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index
+// CHECK-NOT: tensor.pack
+// CHECK: return %[[T]] : tensor<?x?x?x?xf32>
+
+func.func @unpack_empty(%arg0: tensor<256x256xf32>) -> tensor<256x256xf32> {
+ %empty_packed = tensor.empty() : tensor<8x8x32x32xf32>
+ %unpacked = tensor.unpack %empty_packed
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<8x8x32x32xf32> -> tensor<256x256xf32>
+ return %unpacked : tensor<256x256xf32>
+}
+
+// CHECK-LABEL: func.func @unpack_empty(
+// CHECK-SAME: %[[T:.+]]: tensor<256x256xf32>
+// CHECK-NOT: tensor.unpack
+// CHECK: return %[[T]] : tensor<256x256xf32>
+
+func.func @unpack_empty_dynamic(%arg0: tensor<?x?xf32>, %dim0: index, %dim1: index, %dim2: index, %dim3: index) -> tensor<?x?xf32> {
+ %empty_packed = tensor.empty(%dim0, %dim1, %dim2, %dim3) : tensor<?x?x?x?xf32>
+ %unpacked = tensor.unpack %empty_packed
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<?x?x?x?xf32> -> tensor<?x?xf32>
+ return %unpacked : tensor<?x?xf32>
+}
+
+// CHECK-LABEL: func.func @unpack_empty_dynamic(
+// CHECK-SAME: %[[T:.+]]: tensor<?x?xf32>,
+// CHECK-SAME: %[[DIM0:[a-zA-Z0-9_]+]]: index,
+// CHECK-SAME: %[[DIM1:[a-zA-Z0-9_]+]]: index,
+// CHECK-SAME: %[[DIM2:[a-zA-Z0-9_]+]]: index,
+// CHECK-SAME: %[[DIM3:[a-zA-Z0-9_]+]]: index
+// CHECK-NOT: tensor.unpack
+// CHECK: return %[[T]] : tensor<?x?xf32>
+
+func.func @pack_padded_empty(%arg0: tensor<8x8x32x32xf32>) -> tensor<8x8x32x32xf32> {
+ %pad = arith.constant 1.0 : f32
+ %empty_unpacked = tensor.empty() : tensor<256x256xf32>
+ %packed = tensor.pack %empty_unpacked
+ padding_value(%pad : f32)
+ inner_dims_pos = [0, 1] inner_tiles = [32, 32]
+ into %arg0 : tensor<256x256xf32> -> tensor<8x8x32x32xf32>
+ return %packed : tensor<8x8x32x32xf32>
+}
+
+// CHECK-LABEL: func.func @pack_padded_empty(
+// CHECK-SAME: %[[T:.+]]: tensor<8x8x32x32xf32>
+// CHECK: %[[PACK:.+]] = tensor.pack
+// CHECK: return %[[PACK]] : tensor<8x8x32x32xf32>
+
// -----
module attributes {transform.with_named_sequence} {
diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
index 9f486f9146ad..9a3143f5e550 100644
--- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
+++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir
@@ -544,7 +544,7 @@ func.func @linalg_transpose_tensor_unpack_fold(%arg0: tensor<1x1x4x16xi32>) -> t
// CHECK-SAME: outer_dims_perm = [1, 0]
// CHECK-SAME: inner_dims_pos = [1, 0]
// CHECK-SAME: inner_tiles = [4, 16]
-// CHEKC-SAME: into %[[OUT]] : tensor<1x1x4x16xi32> -> tensor<16x4xi32>
+// CHECK-SAME: into %[[OUT]] : tensor<1x1x4x16xi32> -> tensor<16x4xi32>
// CHECK: return %[[UNPACK]] : tensor<16x4xi32>
// CHECK: }
diff --git a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
index d3ac6ce792f3..644d9a918f6c 100644
--- a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
+++ b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
@@ -54,3 +54,105 @@ func.func @rank_reducing_parallel_insert_of_collapse_shape(
}
return %1 : tensor<?x?x?x?xf32>
}
+
+// -----
+
+// CHECK-LABEL: func @insert_of_padding_expand_shape(
+// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME: %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK: %[[insert:.*]] = tensor.insert_slice %[[t]] into %[[d]][%[[x]], %[[y]], 0, 0] [1, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?x?xf32>
+// CHECK: return %[[insert]]
+func.func @insert_of_padding_expand_shape(
+ %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index)
+ -> tensor<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+ %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ %0 = tensor.expand_shape %t [[0, 1], [2, 3]] output_shape [1, %sz0, 1, %sz1]
+ : tensor<?x?xf32> into tensor<1x?x1x?xf32>
+ %1 = tensor.insert_slice %0 into %d[%x, %y, 0, 0][1, %sz0, 1, %sz1][1, 1, 1, 1]
+ : tensor<1x?x1x?xf32> into tensor<?x?x?x?xf32>
+ return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_of_non_padding_expand_shape(
+// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME: %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[sz:[a-zA-Z0-9_]+]]: index
+// CHECK: %[[expand:.*]] = tensor.expand_shape %[[t]] {{\[}}[0, 1], [2]] output_shape [%[[sz]], %{{.*}}, %{{.*}}] : tensor<?x?xf32> into tensor<?x?x?xf32>
+// CHECK: %[[insert:.*]] = tensor.insert_slice %[[expand]] into %[[d]][%[[x]], %[[y]], 0, 0] [%[[sz]], 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+// CHECK: return %[[insert]]
+func.func @insert_of_non_padding_expand_shape(
+ %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index, %sz: index)
+ -> tensor<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+ %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ %0 = tensor.expand_shape %t [[0, 1], [2]] output_shape [%sz, %sz0, %sz1]
+ : tensor<?x?xf32> into tensor<?x?x?xf32>
+ %1 = tensor.insert_slice %0 into %d[%x, %y, 0, 0][%sz, 1, %sz0, %sz1][1, 1, 1, 1]
+ : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+ return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_insert_of_padding_expand_shape(
+// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME: %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK: tensor.parallel_insert_slice %[[t]] into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0] [1, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?x?xf32>
+func.func @parallel_insert_of_padding_expand_shape(
+ %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index)
+ -> tensor<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+ %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ %0 = tensor.expand_shape %t [[0, 1], [2, 3]] output_shape [1, %sz0, 1, %sz1]
+ : tensor<?x?xf32> into tensor<1x?x1x?xf32>
+ %1 = scf.forall (%i, %j) in (%x, %y) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
+ scf.forall.in_parallel {
+ tensor.parallel_insert_slice %0 into %o[%i, %j, 0, 0][1, %sz0, 1, %sz1][1, 1, 1, 1]
+ : tensor<1x?x1x?xf32> into tensor<?x?x?x?xf32>
+ }
+ }
+ return %1 : tensor<?x?x?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @parallel_insert_of_non_padding_expand_shape(
+// CHECK-SAME: %[[t:.*]]: tensor<?x?xf32>
+// CHECK-SAME: %[[d:.*]]: tensor<?x?x?x?xf32>
+// CHECK-SAME: %[[x:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[y:[a-zA-Z0-9_]+]]: index
+// CHECK-SAME: %[[sz:[a-zA-Z0-9_]+]]: index
+// CHECK: %[[expand:.*]] = tensor.expand_shape %[[t]] {{\[}}[0, 1], [2]] output_shape [%[[sz]], %{{.*}}, %{{.*}}] : tensor<?x?xf32> into tensor<?x?x?xf32>
+// CHECK: tensor.parallel_insert_slice %[[expand]] into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0] [%[[sz]], 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+func.func @parallel_insert_of_non_padding_expand_shape(
+ %t: tensor<?x?xf32>, %d: tensor<?x?x?x?xf32>, %x: index, %y: index, %sz: index)
+ -> tensor<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %sz0 = tensor.dim %t, %c0 : tensor<?x?xf32>
+ %sz1 = tensor.dim %t, %c1 : tensor<?x?xf32>
+ %0 = tensor.expand_shape %t [[0, 1], [2]] output_shape [%sz, %sz0, %sz1]
+ : tensor<?x?xf32> into tensor<?x?x?xf32>
+ %1 = scf.forall (%i, %j) in (%x, %y) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
+ scf.forall.in_parallel {
+ tensor.parallel_insert_slice %0 into %o[%i, %j, 0, 0][%sz, 1, %sz0, %sz1][1, 1, 1, 1]
+ : tensor<?x?x?xf32> into tensor<?x?x?x?xf32>
+ }
+ }
+ return %1 : tensor<?x?x?x?xf32>
+}
diff --git a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
index 5a2eade0eccc..f9e51ae52a74 100644
--- a/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
+++ b/mlir/test/Dialect/Tensor/simplify-pack-unpack.mlir
@@ -266,3 +266,131 @@ func.func @unpack_16x1x1x2_to_32x1(%arg0 : tensor<16x1x1x2xf32>) -> tensor<32x1x
: tensor<16x1x1x2xf32> -> tensor<32x1xf32>
return %unpack : tensor<32x1xf32>
}
+
+// -----
+
+// CHECK-LABEL: func.func @pad_like_pack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>)
+// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] output_shape [1, 1, 32, 64] : tensor<32x64xf32> into tensor<1x1x32x64xf32>
+// CHECK: return %[[EXPANDED]] : tensor<1x1x32x64xf32>
+func.func @pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> {
+ %empty = tensor.empty() : tensor<1x1x32x64xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32>
+ return %0 : tensor<1x1x32x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @pad_like_pack_with_outer_dims_perm(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>)
+// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] output_shape [1, 1, 32, 64] : tensor<32x64xf32> into tensor<1x1x32x64xf32>
+// CHECK: return %[[EXPANDED]] : tensor<1x1x32x64xf32>
+func.func @pad_like_pack_with_outer_dims_perm(%arg0: tensor<32x64xf32>) -> tensor<1x1x32x64xf32> {
+ %empty = tensor.empty() : tensor<1x1x32x64xf32>
+ %0 = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<32x64xf32> -> tensor<1x1x32x64xf32>
+ return %0 : tensor<1x1x32x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @inner_pad_like_pack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>)
+// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0], [1, 2]] output_shape [32, 1, 64] : tensor<32x64xf32> into tensor<32x1x64xf32>
+// CHECK: return %[[EXPANDED]] : tensor<32x1x64xf32>
+func.func @inner_pad_like_pack(%arg0: tensor<32x64xf32>) -> tensor<32x1x64xf32> {
+ %empty = tensor.empty() : tensor<32x1x64xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64xf32> -> tensor<32x1x64xf32>
+ return %0 : tensor<32x1x64xf32>
+}
+
+// -----
+
+// Do not simplify pack with inner dimension shuffling.
+// CHECK-LABEL: func.func @pad_and_inner_dim_shuffle_pack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64xf32>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<1x1x64x32xf32>
+// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %[[EMPTY]] : tensor<32x64xf32> -> tensor<1x1x64x32xf32>
+// CHECK: return %[[PACK]] : tensor<1x1x64x32xf32>
+func.func @pad_and_inner_dim_shuffle_pack(%arg0: tensor<32x64xf32>) -> tensor<1x1x64x32xf32> {
+ %empty = tensor.empty() : tensor<1x1x64x32xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [1, 0] inner_tiles = [64, 32] into %empty : tensor<32x64xf32> -> tensor<1x1x64x32xf32>
+ return %0 : tensor<1x1x64x32xf32>
+}
+
+// -----
+
+// Do not simplify pack with inner dimension transpose.
+// CHECK-LABEL: func.func @pad_like_pack_with_transpose(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x64x16xf32>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x1x16x64xf32>
+// CHECK: %[[PACK:.+]] = tensor.pack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32>
+// CHECK: return %[[PACK]] : tensor<32x1x16x64xf32>
+func.func @pad_like_pack_with_transpose(%arg0: tensor<32x64x16xf32>) -> tensor<32x1x16x64xf32> {
+ %empty = tensor.empty() : tensor<32x1x16x64xf32>
+ %0 = tensor.pack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x64x16xf32> -> tensor<32x1x16x64xf32>
+ return %0 : tensor<32x1x16x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @unpad_like_unpack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x32x64xf32>)
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] : tensor<1x1x32x64xf32> into tensor<32x64xf32>
+// CHECK: return %[[COLLAPSED]] : tensor<32x64xf32>
+func.func @unpad_like_unpack(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> {
+ %empty = tensor.empty() : tensor<32x64xf32>
+ %0 = tensor.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32>
+ return %0 : tensor<32x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @unpad_like_unpack_with_outer_dims_perm(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x32x64xf32>)
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0, 1, 2], [3]] : tensor<1x1x32x64xf32> into tensor<32x64xf32>
+// CHECK: return %[[COLLAPSED]] : tensor<32x64xf32>
+func.func @unpad_like_unpack_with_outer_dims_perm(%arg0: tensor<1x1x32x64xf32>) -> tensor<32x64xf32> {
+ %empty = tensor.empty() : tensor<32x64xf32>
+ %0 = tensor.unpack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<32x64xf32>
+ return %0 : tensor<32x64xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func.func @inner_unpad_like_unpack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x1x64xf32>)
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[ARG0]] {{\[}}[0], [1, 2]] : tensor<32x1x64xf32> into tensor<32x64xf32>
+// CHECK: return %[[COLLAPSED]] : tensor<32x64xf32>
+func.func @inner_unpad_like_unpack(%arg0: tensor<32x1x64xf32>) -> tensor<32x64xf32> {
+ %empty = tensor.empty() : tensor<32x64xf32>
+ %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x64xf32> -> tensor<32x64xf32>
+ return %0 : tensor<32x64xf32>
+}
+
+// -----
+
+// Do not simplify unpack with inner dimension shuffling.
+// CHECK-LABEL: func.func @unpad_and_inner_dim_shuffle_pack(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x32x64xf32>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64x32xf32>
+// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %[[EMPTY]] : tensor<1x1x32x64xf32> -> tensor<64x32xf32>
+// CHECK: return %[[UNPACK]] : tensor<64x32xf32>
+func.func @unpad_and_inner_dim_shuffle_pack(%arg0: tensor<1x1x32x64xf32>) -> tensor<64x32xf32> {
+ %empty = tensor.empty() : tensor<64x32xf32>
+ %0 = tensor.unpack %arg0 inner_dims_pos = [1, 0] inner_tiles = [32, 64] into %empty : tensor<1x1x32x64xf32> -> tensor<64x32xf32>
+ return %0 : tensor<64x32xf32>
+}
+
+// -----
+
+// Do not simplify unpack with inner dimension transpose.
+// CHECK-LABEL: func.func @unpad_like_unpack_with_transpose(
+// CHECK-SAME: %[[ARG0:.+]]: tensor<32x1x16x64xf32>)
+// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32x64x16xf32>
+// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] inner_dims_pos = [1] inner_tiles = [64] into %[[EMPTY]] : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32>
+// CHECK: return %[[UNPACK]] : tensor<32x64x16xf32>
+func.func @unpad_like_unpack_with_transpose(%arg0: tensor<32x1x16x64xf32>) -> tensor<32x64x16xf32> {
+ %empty = tensor.empty() : tensor<32x64x16xf32>
+ %0 = tensor.unpack %arg0 inner_dims_pos = [1] inner_tiles = [64] into %empty : tensor<32x1x16x64xf32> -> tensor<32x64x16xf32>
+ return %0 : tensor<32x64x16xf32>
+}
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index c9f7e9c6e2fb..1516f51fe145 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1798,3 +1798,59 @@ func.func @invalid_outerproduct1(%src : memref<?xf32>) {
// expected-error @+1 {{'vector.outerproduct' op expected 1-d vector for operand #1}}
%op = vector.outerproduct %0, %1 : vector<[4]x[4]xf32>, vector<[4]xf32>
}
+
+// -----
+
+func.func @deinterleave_zero_dim_fail(%vec : vector<f32>) {
+  // expected-error @+1 {{'vector.deinterleave' op operand #0 must be vector of any type values, but got 'vector<f32>'}}
+ %0, %1 = vector.deinterleave %vec : vector<f32> -> vector<f32>
+ return
+}
+
+// -----
+
+func.func @deinterleave_one_dim_fail(%vec : vector<1xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the source vector has an even number of elements}}
+ %0, %1 = vector.deinterleave %vec : vector<1xf32> -> vector<1xf32>
+ return
+}
+
+// -----
+
+func.func @deinterleave_oversized_output_fail(%vec : vector<4xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<4xf32>) -> (vector<8xf32>, vector<8xf32>)
+ return
+}
+
+// -----
+
+func.func @deinterleave_output_dim_size_mismatch(%vec : vector<4xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<4xf32>) -> (vector<4xf32>, vector<2xf32>)
+ return
+}
+
+// -----
+
+func.func @deinterleave_n_dim_rank_fail(%vec : vector<2x3x4xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that the trailing dimension of the results is half the width of source trailing dimension}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<2x3x4xf32>) -> (vector<2x3x4xf32>, vector<2x3x2xf32>)
+ return
+}
+
+// -----
+
+func.func @deinterleave_scalable_dim_size_fail(%vec : vector<2x[4]xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that all of {res1, res2} have same type}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<2x[4]xf32>) -> (vector<2x[2]xf32>, vector<2x[1]xf32>)
+ return
+}
+
+// -----
+
+func.func @deinterleave_scalable_rank_fail(%vec : vector<2x[4]xf32>) {
+ // expected-error @+1 {{'vector.deinterleave' op failed to verify that all of {res1, res2} have same type}}
+ %0, %1 = "vector.deinterleave" (%vec) : (vector<2x[4]xf32>) -> (vector<2x[2]xf32>, vector<[2]xf32>)
+ return
+}
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 79a80be4f8b2..9d8101d3eee9 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -1116,3 +1116,45 @@ func.func @interleave_2d_scalable(%a: vector<2x[2]xf64>, %b: vector<2x[2]xf64>)
%0 = vector.interleave %a, %b : vector<2x[2]xf64>
return %0 : vector<2x[4]xf64>
}
+
+// CHECK-LABEL: @deinterleave_1d
+func.func @deinterleave_1d(%arg: vector<4xf32>) -> (vector<2xf32>, vector<2xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<4xf32> -> vector<2xf32>
+ %0, %1 = vector.deinterleave %arg : vector<4xf32> -> vector<2xf32>
+ return %0, %1 : vector<2xf32>, vector<2xf32>
+}
+
+// CHECK-LABEL: @deinterleave_1d_scalable
+func.func @deinterleave_1d_scalable(%arg: vector<[4]xf32>) -> (vector<[2]xf32>, vector<[2]xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<[4]xf32> -> vector<[2]xf32>
+ %0, %1 = vector.deinterleave %arg : vector<[4]xf32> -> vector<[2]xf32>
+ return %0, %1 : vector<[2]xf32>, vector<[2]xf32>
+}
+
+// CHECK-LABEL: @deinterleave_2d
+func.func @deinterleave_2d(%arg: vector<3x4xf32>) -> (vector<3x2xf32>, vector<3x2xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<3x4xf32> -> vector<3x2xf32>
+ %0, %1 = vector.deinterleave %arg : vector<3x4xf32> -> vector<3x2xf32>
+ return %0, %1 : vector<3x2xf32>, vector<3x2xf32>
+}
+
+// CHECK-LABEL: @deinterleave_2d_scalable
+func.func @deinterleave_2d_scalable(%arg: vector<3x[4]xf32>) -> (vector<3x[2]xf32>, vector<3x[2]xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<3x[4]xf32> -> vector<3x[2]xf32>
+ %0, %1 = vector.deinterleave %arg : vector<3x[4]xf32> -> vector<3x[2]xf32>
+ return %0, %1 : vector<3x[2]xf32>, vector<3x[2]xf32>
+}
+
+// CHECK-LABEL: @deinterleave_nd
+func.func @deinterleave_nd(%arg: vector<2x3x4x6xf32>) -> (vector<2x3x4x3xf32>, vector<2x3x4x3xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<2x3x4x6xf32> -> vector<2x3x4x3xf32>
+ %0, %1 = vector.deinterleave %arg : vector<2x3x4x6xf32> -> vector<2x3x4x3xf32>
+ return %0, %1 : vector<2x3x4x3xf32>, vector<2x3x4x3xf32>
+}
+
+// CHECK-LABEL: @deinterleave_nd_scalable
+func.func @deinterleave_nd_scalable(%arg:vector<2x3x4x[6]xf32>) -> (vector<2x3x4x[3]xf32>, vector<2x3x4x[3]xf32>) {
+ // CHECK: vector.deinterleave %{{.*}} : vector<2x3x4x[6]xf32> -> vector<2x3x4x[3]xf32>
+ %0, %1 = vector.deinterleave %arg : vector<2x3x4x[6]xf32> -> vector<2x3x4x[3]xf32>
+ return %0, %1 : vector<2x3x4x[3]xf32>, vector<2x3x4x[3]xf32>
+}
diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir
index 020942e7f4c1..bcc146ea0101 100644
--- a/mlir/test/IR/parser.mlir
+++ b/mlir/test/IR/parser.mlir
@@ -597,7 +597,7 @@ func.func @funcattrwithblock() -> ()
return
}
-// CHECK-label func @funcsimplemap
+// CHECK-LABEL: func @funcsimplemap
#map_simple0 = affine_map<()[] -> (10)>
#map_simple1 = affine_map<()[s0] -> (s0)>
#map_non_simple0 = affine_map<(d0)[] -> (d0)>
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 2e59b7234e53..391fda82e1e1 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -6,15 +6,6 @@
// RUN: --entry-point-result=void \
// RUN: | FileCheck %s
-// Basic PTX check to make sure we are generating the right instructions.
-
-// CHECK-PTX: mbarrier.init.shared.b64
-// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
-// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
-// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
-// CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64
-// CHECK-PTX: mbarrier.try_wait.parity.shared.b64
-
// RUN: mlir-opt %s --convert-nvgpu-to-nvvm \
// RUN: -gpu-kernel-outlining \
// RUN: -convert-nvvm-to-llvm \
diff --git a/mlir/test/Pass/ir-printing-file-tree.mlir b/mlir/test/Pass/ir-printing-file-tree.mlir
new file mode 100644
index 000000000000..b00d77db2c60
--- /dev/null
+++ b/mlir/test/Pass/ir-printing-file-tree.mlir
@@ -0,0 +1,41 @@
+// Test filtering by "before"
+// RUN: rm -rf %t || true
+// RUN: mlir-opt %s -mlir-print-ir-tree-dir=%t \
+// RUN: -pass-pipeline='builtin.module(builtin.module(func.func(cse,canonicalize)))' \
+// RUN: -mlir-print-ir-before=cse
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symB/0_0_0_cse.mlir
+// RUN: test ! -f %t/builtin_module_outer/builtin_module_inner/func_func_symB/0_0_1_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symC/0_0_0_cse.mlir
+// RUN: test ! -f %t/builtin_module_outer/builtin_module_inner/func_func_symC/0_0_1_canonicalize.mlir
+
+// Test printing after all and the counter mechanism.
+// RUN: rm -rf %t || true
+// RUN: mlir-opt %s -mlir-print-ir-tree-dir=%t \
+// RUN: -pass-pipeline='builtin.module(canonicalize,canonicalize,func.func(cse),builtin.module(canonicalize,func.func(cse,canonicalize),cse),cse)' \
+// RUN: -mlir-print-ir-after-all
+// RUN: test -f %t/builtin_module_outer/0_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/1_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/func_func_symA/1_0_cse.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/1_0_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symB/1_0_0_cse.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symB/1_0_1_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symC/1_0_0_cse.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/func_func_symC/1_0_1_canonicalize.mlir
+// RUN: test -f %t/builtin_module_outer/builtin_module_inner/1_1_cse.mlir
+// RUN: test -f %t/builtin_module_outer/2_cse.mlir
+
+builtin.module @outer {
+
+ func.func @symA() {
+ return
+ }
+
+ builtin.module @inner {
+ func.func @symB() {
+ return
+ }
+ func.func @symC() {
+ return
+ }
+ }
+}
diff --git a/mlir/test/Target/LLVMIR/Import/global-variables.ll b/mlir/test/Target/LLVMIR/Import/global-variables.ll
index 9d9734045988..902f77bd7e6c 100644
--- a/mlir/test/Target/LLVMIR/Import/global-variables.ll
+++ b/mlir/test/Target/LLVMIR/Import/global-variables.ll
@@ -36,7 +36,7 @@
; CHECK-DAG: %[[ADDR:[0-9]+]] = llvm.mlir.addressof @global_int : !llvm.ptr
; CHECK-DAG: %[[IDX:[0-9]+]] = llvm.mlir.constant(2 : i32) : i32
; CHECK-DAG: %[[GEP:[0-9]+]] = llvm.getelementptr %[[ADDR]][%[[IDX]]] : (!llvm.ptr, i32) -> !llvm.ptr
-; CHECK-DAG llvm.return %[[GEP]] : !llvm.ptr
+; CHECK-DAG: llvm.return %[[GEP]] : !llvm.ptr
@global_gep_const_expr = internal constant ptr getelementptr (i32, ptr @global_int, i32 2)
; // -----
diff --git a/mlir/test/Target/LLVMIR/Import/metadata-loop.ll b/mlir/test/Target/LLVMIR/Import/metadata-loop.ll
index 3516101a2367..20431a7412bd 100644
--- a/mlir/test/Target/LLVMIR/Import/metadata-loop.ll
+++ b/mlir/test/Target/LLVMIR/Import/metadata-loop.ll
@@ -324,7 +324,7 @@ end:
; // -----
; Verify the unused access group is not imported.
-; CHECK-COUNT1: #llvm.access_group
+; CHECK-COUNT-1: #llvm.access_group
; CHECK-LABEL: @unused_parallel_access
define void @unused_parallel_access(ptr %arg) {
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index 1cb94bca169d..2792f13e4ef8 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -234,7 +234,7 @@ llvm.func @func_with_inlined_dbg_value(%arg0: i32) -> (i32) {
// CHECK-DAG: ![[LEXICAL_BLOCK_FILE:.*]] = distinct !DILexicalBlockFile(scope: ![[INNER_FUNC]], file: ![[FILE]], discriminator: 0)
// CHECK-DAG: ![[VAR_LOC0]] = !DILocalVariable(name: "a", scope: ![[OUTER_FUNC]], file: ![[FILE]]
// CHECK-DAG: ![[VAR_LOC1]] = !DILocalVariable(name: "b", scope: ![[LEXICAL_BLOCK_FILE]], file: ![[FILE]]
-// CHECK-DAG ![[LABEL]] = !DILabel(scope: ![[LEXICAL_BLOCK_FILE]], name: "label", file: ![[FILE]], line: 42)
+// CHECK-DAG: ![[LABEL]] = !DILabel(scope: ![[LEXICAL_BLOCK_FILE]], name: "label", file: ![[FILE]], line: 42)
// -----
diff --git a/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir b/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir
index 08ccbf04014a..0016a1f05a2b 100644
--- a/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-array-sectioning-host.mlir
@@ -42,7 +42,7 @@ module attributes {omp.is_target_device = false} {
// CHECK: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 36, i64 108]
// CHECK: @.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
-// CHECKL: @.offload_mapnames = private constant [2 x ptr] [ptr @0, ptr @1]
+// CHECK: @.offload_mapnames = private constant [2 x ptr] [ptr @0, ptr @1]
// CHECK: define void @_3d_target_array_section()
diff --git a/mlir/test/Transforms/test-convert-func-op.mlir b/mlir/test/Transforms/test-convert-func-op.mlir
new file mode 100644
index 000000000000..6e96703cda57
--- /dev/null
+++ b/mlir/test/Transforms/test-convert-func-op.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-opt %s -test-convert-func-op | FileCheck %s
+
+// CHECK-LABEL: llvm.func @add
+func.func @add(%arg0: i32, %arg1: i32) -> i32 attributes { llvm.emit_c_interface } {
+ %res = arith.addi %arg0, %arg1 : i32
+ return %res : i32
+}
+// CHECK-LABEL: llvm.func @_mlir_ciface_add
+// CHECK-SAME: [[ARG0:%[a-zA-Z0-9_]+]]: i32
+// CHECK-SAME: [[ARG1:%[a-zA-Z0-9_]+]]: i32
+// CHECK-NEXT: [[RES:%.*]] = llvm.call @add([[ARG0]], [[ARG1]])
+// CHECK-NEXT: llvm.return [[RES]]
diff --git a/mlir/test/lib/Analysis/CMakeLists.txt b/mlir/test/lib/Analysis/CMakeLists.txt
index d168888c1e71..7c6b31ae8b73 100644
--- a/mlir/test/lib/Analysis/CMakeLists.txt
+++ b/mlir/test/lib/Analysis/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_library(MLIRTestAnalysis
TestMemRefDependenceCheck.cpp
TestMemRefStrideCalculation.cpp
TestSlice.cpp
+ TestTopologicalSort.cpp
DataFlow/TestDeadCodeAnalysis.cpp
DataFlow/TestDenseBackwardDataFlowAnalysis.cpp
diff --git a/mlir/test/lib/Analysis/TestSlice.cpp b/mlir/test/lib/Analysis/TestSlice.cpp
index b445febde597..7e8320dbf3ec 100644
--- a/mlir/test/lib/Analysis/TestSlice.cpp
+++ b/mlir/test/lib/Analysis/TestSlice.cpp
@@ -1,4 +1,4 @@
-//===------------- TestSlice.cpp - Test slice related analisis ------------===//
+//===- TestSlice.cpp - Test slice related analysis ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,13 +6,15 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Pass/Pass.h"
using namespace mlir;
-static const StringLiteral kOrderMarker = "__test_sort_original_idx__";
+static const StringLiteral kToSortMark = "test_to_sort";
+static const StringLiteral kOrderIndex = "test_sort_index";
namespace {
@@ -23,23 +25,20 @@ struct TestTopologicalSortPass
StringRef getArgument() const final { return "test-print-topological-sort"; }
StringRef getDescription() const final {
- return "Print operations in topological order";
+ return "Sorts operations topologically and attaches attributes with their "
+ "corresponding index in the ordering to them";
}
void runOnOperation() override {
- std::map<int, Operation *> ops;
- getOperation().walk([&ops](Operation *op) {
- if (auto originalOrderAttr = op->getAttrOfType<IntegerAttr>(kOrderMarker))
- ops[originalOrderAttr.getInt()] = op;
+ SetVector<Operation *> toSort;
+ getOperation().walk([&](Operation *op) {
+ if (op->hasAttrOfType<UnitAttr>(kToSortMark))
+ toSort.insert(op);
});
- SetVector<Operation *> sortedOp;
- for (auto op : ops)
- sortedOp.insert(op.second);
- sortedOp = topologicalSort(sortedOp);
- llvm::errs() << "Testing : " << getOperation().getName() << "\n";
- for (Operation *op : sortedOp) {
- op->print(llvm::errs());
- llvm::errs() << "\n";
- }
+
+ auto i32Type = IntegerType::get(&getContext(), 32);
+ SetVector<Operation *> sortedOps = topologicalSort(toSort);
+ for (auto [index, op] : llvm::enumerate(sortedOps))
+ op->setAttr(kOrderIndex, IntegerAttr::get(i32Type, index));
}
};
diff --git a/mlir/test/lib/Transforms/TestTopologicalSort.cpp b/mlir/test/lib/Analysis/TestTopologicalSort.cpp
index 3b110c712620..c7e0206b2a4d 100644
--- a/mlir/test/lib/Transforms/TestTopologicalSort.cpp
+++ b/mlir/test/lib/Analysis/TestTopologicalSort.cpp
@@ -6,10 +6,10 @@
//
//===----------------------------------------------------------------------===//
+#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/TopologicalSortUtils.h"
using namespace mlir;
diff --git a/mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt b/mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt
index 45ba62d839d3..d3dbc94a99bc 100644
--- a/mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt
+++ b/mlir/test/lib/Conversion/FuncToLLVM/CMakeLists.txt
@@ -1,6 +1,7 @@
# Exclude tests from libMLIR.so
add_mlir_library(MLIRTestFuncToLLVM
TestConvertCallOp.cpp
+ TestConvertFuncOp.cpp
EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp b/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp
new file mode 100644
index 000000000000..e25e890e2290
--- /dev/null
+++ b/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp
@@ -0,0 +1,93 @@
+//===- TestConvertFuncOp.cpp - Test LLVM Conversion of Func FuncOp --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestDialect.h"
+
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+
+using namespace mlir;
+
+namespace {
+
+/// Test helper conversion pattern that directly calls
+/// `convertFuncOpToLLVMFuncOp` to verify that this utility function covers
+/// the full functionality of the conversion.
+struct FuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
+ FuncOpConversion(const LLVMTypeConverter &converter)
+ : ConvertOpToLLVMPattern(converter) {}
+
+ LogicalResult
+ matchAndRewrite(func::FuncOp funcOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ FailureOr<LLVM::LLVMFuncOp> newFuncOp = mlir::convertFuncOpToLLVMFuncOp(
+ cast<FunctionOpInterface>(funcOp.getOperation()), rewriter,
+ *getTypeConverter());
+ if (failed(newFuncOp))
+ return rewriter.notifyMatchFailure(funcOp, "Could not convert funcop");
+
+ rewriter.eraseOp(funcOp);
+ return success();
+ }
+};
+
+struct ReturnOpConversion : public ConvertOpToLLVMPattern<func::ReturnOp> {
+ ReturnOpConversion(const LLVMTypeConverter &converter)
+ : ConvertOpToLLVMPattern(converter) {}
+
+ LogicalResult
+ matchAndRewrite(func::ReturnOp returnOp, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(returnOp,
+ returnOp->getOperands());
+ return success();
+ }
+};
+
+struct TestConvertFuncOp
+ : public PassWrapper<TestConvertFuncOp, OperationPass<ModuleOp>> {
+ MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestConvertFuncOp)
+
+ void getDependentDialects(DialectRegistry &registry) const final {
+ registry.insert<LLVM::LLVMDialect>();
+ }
+
+ StringRef getArgument() const final { return "test-convert-func-op"; }
+
+ StringRef getDescription() const final {
+ return "Tests conversion of `func.func` to `llvm.func` for different "
+ "attributes";
+ }
+
+ void runOnOperation() override {
+ MLIRContext *ctx = &getContext();
+
+ LowerToLLVMOptions options(ctx);
+ // Populate type conversions.
+ LLVMTypeConverter typeConverter(ctx, options);
+
+ RewritePatternSet patterns(ctx);
+ patterns.add<FuncOpConversion>(typeConverter);
+ patterns.add<ReturnOpConversion>(typeConverter);
+
+ LLVMConversionTarget target(getContext());
+ if (failed(applyPartialConversion(getOperation(), target,
+ std::move(patterns))))
+ signalPassFailure();
+ }
+};
+
+} // namespace
+
+namespace mlir::test {
+void registerConvertFuncOpPass() { PassRegistration<TestConvertFuncOp>(); }
+} // namespace mlir::test
diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
index bfee0391f670..b058a8e1abbc 100644
--- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
@@ -706,11 +706,20 @@ void TestReflectBoundsOp::inferResultRanges(
const ConstantIntRanges &range = argRanges[0];
MLIRContext *ctx = getContext();
Builder b(ctx);
- auto intTy = getType();
- setUminAttr(b.getIntegerAttr(intTy, range.umin()));
- setUmaxAttr(b.getIntegerAttr(intTy, range.umax()));
- setSminAttr(b.getIntegerAttr(intTy, range.smin()));
- setSmaxAttr(b.getIntegerAttr(intTy, range.smax()));
+ Type sIntTy, uIntTy;
+ // For plain `IntegerType`s, we can derive the appropriate signed and unsigned
+ // Types for the Attributes.
+ if (auto intTy = llvm::dyn_cast<IntegerType>(getType())) {
+ unsigned bitwidth = intTy.getWidth();
+ sIntTy = b.getIntegerType(bitwidth, /*isSigned=*/true);
+ uIntTy = b.getIntegerType(bitwidth, /*isSigned=*/false);
+ } else
+ sIntTy = uIntTy = getType();
+
+ setUminAttr(b.getIntegerAttr(uIntTy, range.umin()));
+ setUmaxAttr(b.getIntegerAttr(uIntTy, range.umax()));
+ setSminAttr(b.getIntegerAttr(sIntTy, range.smin()));
+ setSmaxAttr(b.getIntegerAttr(sIntTy, range.smax()));
setResultRanges(getResult(), range);
}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index c5d0341b7de7..faf70ad91b06 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -1697,6 +1697,12 @@ def : Pat<
ConstantStrAttr<StrAttr, "MatchVariadic">)>;
def : Pat<
+ (MixedVOperandOp5 $input1a, $input1b, $input2, $attr1,
+ ConstantStrAttr<StrAttr, "MatchInverseVariadic">),
+ (MixedVOperandOp3 $input2, (variadic $input1b), (variadic $input1a),
+ ConstantAttr<I32Attr, "1">:$attr1)>;
+
+def : Pat<
(MixedVOperandOp4 (variadic (MixedVOperandInOutI32Op $input1a),
(MixedVOperandInOutI32Op $input1b)),
$input2, ConstantAttr<I32Attr, "1">:$attr1),
diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt
index a849b7ebd29e..975a41ac3d5f 100644
--- a/mlir/test/lib/Transforms/CMakeLists.txt
+++ b/mlir/test/lib/Transforms/CMakeLists.txt
@@ -26,7 +26,6 @@ add_mlir_library(MLIRTestTransforms
TestInlining.cpp
TestIntRangeInference.cpp
TestMakeIsolatedFromAbove.cpp
- TestTopologicalSort.cpp
${MLIRTestTransformsPDLSrc}
EXCLUDE_FROM_LIBMLIR
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index ea6d9ae71b77..9ed3a2efcb8f 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -245,7 +245,7 @@ def have_host_jit_feature_support(feature_name):
if have_host_jit_feature_support("jit"):
config.available_features.add("host-supports-jit")
-if config.run_cuda_tests:
+if config.run_nvptx_tests:
config.available_features.add("host-supports-nvptx")
if config.run_rocm_tests:
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index c0fa1b8980e5..4f5186df7d20 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -25,7 +25,7 @@ config.mlir_cmake_dir = "@MLIR_CMAKE_DIR@"
config.mlir_lib_dir = "@MLIR_LIB_DIR@"
config.build_examples = @LLVM_BUILD_EXAMPLES@
-config.run_cuda_tests = @MLIR_ENABLE_CUDA_CONVERSIONS@
+config.run_nvptx_tests = @LLVM_HAS_NVPTX_TARGET@
config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
diff --git a/mlir/test/mlir-tblgen/op-decl-and-defs.td b/mlir/test/mlir-tblgen/op-decl-and-defs.td
index 499e3ceecaf0..836ddca5eb84 100644
--- a/mlir/test/mlir-tblgen/op-decl-and-defs.td
+++ b/mlir/test/mlir-tblgen/op-decl-and-defs.td
@@ -58,7 +58,8 @@ def NS_AOp : NS_Op<"a_op", [IsolatedFromAbove, IsolatedFromAbove]> {
// CHECK: namespace detail {
// CHECK: class AOpGenericAdaptorBase {
// CHECK: public:
-// CHECK: AOpGenericAdaptorBase(AOp{{[[:space:]]}}
+// CHECK: AOpGenericAdaptorBase(::mlir::DictionaryAttr attrs = {}, const ::mlir::EmptyProperties &properties = {}, ::mlir::RegionRange regions = {}) : odsAttrs(attrs), odsRegions(regions)
+// CHECK: AOpGenericAdaptorBase(::mlir::Operation *op) : odsAttrs(op->getRawDictionaryAttrs()), odsOpName(op->getName()), odsRegions(op->getRegions()) {}
// CHECK: ::mlir::IntegerAttr getAttr1Attr();
// CHECK: uint32_t getAttr1();
// CHECK: ::mlir::FloatAttr getSomeAttr2Attr();
@@ -128,15 +129,8 @@ def NS_AOp : NS_Op<"a_op", [IsolatedFromAbove, IsolatedFromAbove]> {
// DEFS-LABEL: NS::AOp definitions
-// DEFS: AOpGenericAdaptorBase::AOpGenericAdaptorBase(::mlir::DictionaryAttr attrs, const ::mlir::EmptyProperties &properties, ::mlir::RegionRange regions) : odsAttrs(attrs), odsRegions(regions)
-
// Check that `getAttrDictionary()` is used when not using properties.
-// DEFS: AOpGenericAdaptorBase::AOpGenericAdaptorBase(AOp op)
-// DEFS-SAME: op->getAttrDictionary()
-// DEFS-SAME: p.getProperties()
-// DEFS-SAME: op->getRegions()
-
// DECLS: ::mlir::RegionRange AOpGenericAdaptorBase::getSomeRegions()
// DECLS-NEXT: return odsRegions.drop_front(1);
// DECLS: ::mlir::RegionRange AOpGenericAdaptorBase::getRegions()
@@ -344,12 +338,11 @@ def NS_NOp : NS_Op<"op_with_properties", []> {
let arguments = (ins Property<"unsigned">:$value);
}
-// Check that `getDiscardableAttrDictionary()` is used with properties.
-
-// DEFS: NOpGenericAdaptorBase::NOpGenericAdaptorBase(NOp op) : NOpGenericAdaptorBase(
-// DEFS-SAME: op->getDiscardableAttrDictionary()
-// DEFS-SAME: op.getProperties()
-// DEFS-SAME: op->getRegions()
+// DEFS: NOpGenericAdaptorBase::NOpGenericAdaptorBase(NOp op) :
+// DEFS-SAME: odsAttrs(op->getRawDictionaryAttrs())
+// DEFS-SAME: odsOpName(op->getName())
+// DEFS-SAME: properties(op.getProperties())
+// DEFS-SAME: odsRegions(op->getRegions())
// Test that type defs have the proper namespaces when used as a constraint.
// ---
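
For readability outside of FileCheck syntax, the following is a rough C++ sketch of the adaptor-base constructors the updated emitter now produces, reconstructed from the CHECK/DEFS lines above for the test's AOp (an op without properties). The class and member names mirror that test case; this is not literal tblgen output.

// Reconstructed sketch of the generated adaptor base; member names follow
// the CHECK lines in op-decl-and-defs.td above.
class AOpGenericAdaptorBase {
public:
  // Inline constructor with defaulted parameters, usable without an op.
  AOpGenericAdaptorBase(::mlir::DictionaryAttr attrs = {},
                        const ::mlir::EmptyProperties &properties = {},
                        ::mlir::RegionRange regions = {})
      : odsAttrs(attrs), odsRegions(regions) {}
  // New inline constructor: takes a generic Operation* and reads the raw
  // attribute dictionary, so it no longer needs the complete op class.
  AOpGenericAdaptorBase(::mlir::Operation *op)
      : odsAttrs(op->getRawDictionaryAttrs()), odsOpName(op->getName()),
        odsRegions(op->getRegions()) {}

protected:
  ::mlir::DictionaryAttr odsAttrs;
  ::std::optional<::mlir::OperationName> odsOpName;
  ::mlir::RegionRange odsRegions;
};
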
diff --git a/mlir/test/mlir-tblgen/op-operand.td b/mlir/test/mlir-tblgen/op-operand.td
index a74970824479..a2fa1f7046a9 100644
--- a/mlir/test/mlir-tblgen/op-operand.td
+++ b/mlir/test/mlir-tblgen/op-operand.td
@@ -15,9 +15,6 @@ def OpA : NS_Op<"one_normal_operand_op", []> {
// CHECK-LABEL: OpA definitions
-// CHECK: OpAGenericAdaptorBase::OpAGenericAdaptorBase
-// CHECK-SAME: odsAttrs(attrs)
-
// CHECK: void OpA::build
// CHECK: ::mlir::Value input
// CHECK: odsState.addOperands(input);
diff --git a/mlir/test/mlir-tblgen/pattern.mlir b/mlir/test/mlir-tblgen/pattern.mlir
index 7f9c450f15b2..5ff8710b9377 100644
--- a/mlir/test/mlir-tblgen/pattern.mlir
+++ b/mlir/test/mlir-tblgen/pattern.mlir
@@ -527,6 +527,14 @@ func.func @testMatchVariadic(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32) ->
return
}
+// CHECK-LABEL: @testReplaceVariadic
+func.func @testReplaceVariadic(%arg0: i32, %arg1: i32, %arg2: i32, %arg3: i32) -> () {
+ // CHECK: "test.mixed_variadic_in3"(%arg2, %arg1, %arg0) <{count = 1 : i32}>
+ "test.mixed_variadic_in5"(%arg0, %arg1, %arg2) <{attr1 = 0 : i32, pattern_name = "MatchInverseVariadic"}> : (i32, i32, i32) -> ()
+
+ return
+}
+
// CHECK-LABEL: @testMatchVariadicSubDag
func.func @testMatchVariadicSubDag(%arg0: i32, %arg1: i32, %arg2: i32) -> () {
// CHECK: %[[IN0:.*]] = "test.mixed_variadic_in_out_i32"(%arg0) : (i32) -> i32
diff --git a/mlir/test/mlir-vulkan-runner/iaddcarry_extended.mlir b/mlir/test/mlir-vulkan-runner/addui_extended.mlir
index 9b1f1964b3f9..9b1f1964b3f9 100644
--- a/mlir/test/mlir-vulkan-runner/iaddcarry_extended.mlir
+++ b/mlir/test/mlir-vulkan-runner/addui_extended.mlir
diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py
index 935534edba7a..f97017b7a2c7 100644
--- a/mlir/test/python/dialects/transform_structured_ext.py
+++ b/mlir/test/python/dialects/transform_structured_ext.py
@@ -443,7 +443,7 @@ def testTileExplicitLoopTypeAll(target):
structured.TileUsingForOp(types, target, sizes=[2, 3, 4])
# CHECK-LABEL: TEST: testTileExplicitLoopTypeAll
# CHECK: = transform.structured.tile
- # CHECK-SAME : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">,
+ # CHECK-SAME: (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">,
# CHECK-SAME: !transform.op<"scf.parallel">, !transform.op<"scf.forall">
diff --git a/mlir/test/python/ir/array_attributes.py b/mlir/test/python/ir/array_attributes.py
index 9251588a4c48..2bc403aace83 100644
--- a/mlir/test/python/ir/array_attributes.py
+++ b/mlir/test/python/ir/array_attributes.py
@@ -51,6 +51,87 @@ def testGetDenseElementsUnSupportedTypeOkIfExplicitTypeProvided():
################################################################################
+# Tests of the list of attributes .get() factory method
+################################################################################
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromList
+@run
+def testGetDenseElementsFromList():
+ with Context(), Location.unknown():
+ attrs = [FloatAttr.get(F64Type.get(), 1.0), FloatAttr.get(F64Type.get(), 2.0)]
+ attr = DenseElementsAttr.get(attrs)
+
+ # CHECK: dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64>
+ print(attr)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListWithExplicitType
+@run
+def testGetDenseElementsFromListWithExplicitType():
+ with Context(), Location.unknown():
+ attrs = [FloatAttr.get(F64Type.get(), 1.0), FloatAttr.get(F64Type.get(), 2.0)]
+ shaped_type = ShapedType(Type.parse("tensor<2xf64>"))
+ attr = DenseElementsAttr.get(attrs, shaped_type)
+
+ # CHECK: dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf64>
+ print(attr)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListEmptyList
+@run
+def testGetDenseElementsFromListEmptyList():
+ with Context(), Location.unknown():
+ attrs = []
+
+ try:
+ attr = DenseElementsAttr.get(attrs)
+ except ValueError as e:
+ # CHECK: Attributes list must be non-empty
+ print(e)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListNonAttributeType
+@run
+def testGetDenseElementsFromListNonAttributeType():
+ with Context(), Location.unknown():
+ attrs = [1.0]
+
+ try:
+ attr = DenseElementsAttr.get(attrs)
+ except RuntimeError as e:
+ # CHECK: Invalid attribute when attempting to create an ArrayAttribute
+ print(e)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListMismatchedType
+@run
+def testGetDenseElementsFromListMismatchedType():
+ with Context(), Location.unknown():
+ attrs = [FloatAttr.get(F64Type.get(), 1.0), FloatAttr.get(F64Type.get(), 2.0)]
+ shaped_type = ShapedType(Type.parse("tensor<2xf32>"))
+
+ try:
+ attr = DenseElementsAttr.get(attrs, shaped_type)
+ except ValueError as e:
+ # CHECK: All attributes must be of the same type and match the type parameter
+ print(e)
+
+
+# CHECK-LABEL: TEST: testGetDenseElementsFromListMixedTypes
+@run
+def testGetDenseElementsFromListMixedTypes():
+ with Context(), Location.unknown():
+ attrs = [FloatAttr.get(F64Type.get(), 1.0), FloatAttr.get(F32Type.get(), 2.0)]
+
+ try:
+ attr = DenseElementsAttr.get(attrs)
+ except ValueError as e:
+ # CHECK: All attributes must be of the same type and match the type parameter
+ print(e)
+
+
+################################################################################
# Splats.
################################################################################
@@ -205,6 +286,7 @@ def testGetDenseElementsBoolSplat():
### float and double arrays.
+
# CHECK-LABEL: TEST: testGetDenseElementsF16
@run
def testGetDenseElementsF16():
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 1dfc5d178b61..0e8b161d5134 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -71,6 +71,7 @@ namespace test {
void registerTestCompositePass();
void registerCommutativityUtils();
void registerConvertCallOpPass();
+void registerConvertFuncOpPass();
void registerInliner();
void registerMemRefBoundCheck();
void registerPatternsTestPass();
@@ -199,6 +200,7 @@ void registerTestPasses() {
mlir::test::registerTestCompositePass();
mlir::test::registerCommutativityUtils();
mlir::test::registerConvertCallOpPass();
+ mlir::test::registerConvertFuncOpPass();
mlir::test::registerInliner();
mlir::test::registerMemRefBoundCheck();
mlir::test::registerPatternsTestPass();
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index e013ccac5dd0..adda7ce6fc6c 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -4101,7 +4101,8 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
"{}");
}
paramList.emplace_back("::mlir::RegionRange", "regions", "{}");
- auto *baseConstructor = genericAdaptorBase.addConstructor(paramList);
+ auto *baseConstructor =
+ genericAdaptorBase.addConstructor<Method::Inline>(paramList);
baseConstructor->addMemberInitializer("odsAttrs", "attrs");
if (useProperties)
baseConstructor->addMemberInitializer("properties", "properties");
@@ -4163,14 +4164,24 @@ OpOperandAdaptorEmitter::OpOperandAdaptorEmitter(
// and the value range from the parameter.
{
// Base class is in the cpp file and can simply access the members of the op
- // class to initialize the template independent fields.
- auto *constructor = genericAdaptorBase.addConstructor(
- MethodParameter(op.getCppClassName(), "op"));
- constructor->addMemberInitializer(
- genericAdaptorBase.getClassName(),
- llvm::Twine(!useProperties ? "op->getAttrDictionary()"
- : "op->getDiscardableAttrDictionary()") +
- ", op.getProperties(), op->getRegions()");
+ // class to initialize the template independent fields. If the op doesn't
+ // have properties, we can emit a generic constructor inline. Otherwise,
+ // emit it out-of-line because we need the op to be defined.
+ Constructor *constructor;
+ if (useProperties) {
+ constructor = genericAdaptorBase.addConstructor(
+ MethodParameter(op.getCppClassName(), "op"));
+ } else {
+ constructor = genericAdaptorBase.addConstructor<Method::Inline>(
+ MethodParameter("::mlir::Operation *", "op"));
+ }
+ constructor->addMemberInitializer("odsAttrs",
+ "op->getRawDictionaryAttrs()");
+ // Retrieve the operation name from the op directly.
+ constructor->addMemberInitializer("odsOpName", "op->getName()");
+ if (useProperties)
+ constructor->addMemberInitializer("properties", "op.getProperties()");
+ constructor->addMemberInitializer("odsRegions", "op->getRegions()");
// Generic adaptor is templated and therefore defined inline in the header.
// We cannot use the Op class here as it is an incomplete type (we have a
diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp
index e63a065a0708..d8e16d98fd75 100644
--- a/mlir/tools/mlir-tblgen/RewriterGen.cpp
+++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp
@@ -159,6 +159,10 @@ private:
// Returns the symbol of the old value serving as the replacement.
StringRef handleReplaceWithValue(DagNode tree);
+ // Emits the C++ statement to replace the matched DAG with an array of
+ // matched values.
+ std::string handleVariadic(DagNode tree, int depth);
+
// Trailing directives are used at the end of DAG node argument lists to
// specify additional behaviour for op matchers and creators, etc.
struct TrailingDirectives {
@@ -1241,6 +1245,9 @@ std::string PatternEmitter::handleResultPattern(DagNode resultTree,
if (resultTree.isReplaceWithValue())
return handleReplaceWithValue(resultTree).str();
+ if (resultTree.isVariadic())
+ return handleVariadic(resultTree, depth);
+
// Normal op creation.
auto symbol = handleOpCreation(resultTree, resultIndex, depth);
if (resultTree.getSymbol().empty()) {
@@ -1251,6 +1258,26 @@ std::string PatternEmitter::handleResultPattern(DagNode resultTree,
return symbol;
}
+std::string PatternEmitter::handleVariadic(DagNode tree, int depth) {
+ assert(tree.isVariadic());
+
+ auto name = std::string(formatv("tblgen_variadic_values_{0}", nextValueId++));
+ symbolInfoMap.bindValue(name);
+ os << "::llvm::SmallVector<::mlir::Value, 4> " << name << ";\n";
+ for (int i = 0, e = tree.getNumArgs(); i != e; ++i) {
+ if (auto child = tree.getArgAsNestedDag(i)) {
+ os << name << ".push_back(" << handleResultPattern(child, i, depth + 1)
+ << ");\n";
+ } else {
+ os << name << ".push_back("
+ << handleOpArgument(tree.getArgAsLeaf(i), tree.getArgName(i))
+ << ");\n";
+ }
+ }
+
+ return name;
+}
+
StringRef PatternEmitter::handleReplaceWithValue(DagNode tree) {
assert(tree.isReplaceWithValue());
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index cea49356538f..a8fe20d52fb2 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -30,7 +30,7 @@
using namespace mlir;
// Skip the test if the NVPTX target was not built.
-#if MLIR_ENABLE_CUDA_CONVERSIONS
+#if LLVM_HAS_NVPTX_TARGET
#define SKIP_WITHOUT_NVPTX(x) x
#else
#define SKIP_WITHOUT_NVPTX(x) DISABLED_##x
diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index c3dcebfb7301..ef90dc90bf11 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -143,9 +143,31 @@ set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
if(LIBOMPTARGET_PLUGINS_TO_BUILD STREQUAL "all")
set(LIBOMPTARGET_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_PLUGIN_TARGETS})
endif()
+
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux" AND
+ "host" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+ message(STATUS "Not building host plugin: only Linux systems are supported")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "host")
+endif()
+if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$"
+ AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+ if("amdgpu" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+ message(STATUS "Not building AMDGPU plugin: only support AMDGPU in "
+ "Linux x86_64, ppc64le, or aarch64 hosts")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "amdgpu")
+ endif()
+ if("nvptx" IN_LIST LIBOMPTARGET_PLUGINS_TO_BUILD)
+ message(STATUS "Not building CUDA plugin: only support AMDGPU in "
+ "Linux x86_64, ppc64le, or aarch64 hosts")
+ list(REMOVE_ITEM LIBOMPTARGET_PLUGINS_TO_BUILD "cuda")
+ endif()
+endif()
message(STATUS "Building the offload library with support for "
"the \"${LIBOMPTARGET_PLUGINS_TO_BUILD}\" plugins")
+set(LIBOMPTARGET_DLOPEN_PLUGINS "${LIBOMPTARGET_PLUGINS_TO_BUILD}" CACHE STRING
+ "Semicolon-separated list of plugins to use 'dlopen' for runtime linking")
+
set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS "")
foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
set(LIBOMPTARGET_ENUM_PLUGIN_TARGETS
diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h
index 4ab0aea46eea..d43b7f5c95de 100644
--- a/offload/DeviceRTL/include/Utils.h
+++ b/offload/DeviceRTL/include/Utils.h
@@ -25,6 +25,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
+uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+
/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
uint64_t pack(uint32_t LowBits, uint32_t HighBits);
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index b2028a8fb4f5..4f39d2a299ee 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -364,4 +364,8 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
+extern "C" uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
+ return utils::ballotSync(mask, pred);
+}
+
#pragma omp end declare target
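
As a minimal sketch of how the new `ompx_ballot_sync` entry point could be called from user code: only the entry point itself comes from this patch; the helper function, its data, and the forward declaration below are illustrative assumptions.

// Hypothetical host/device example; each active lane votes on its predicate
// and receives a bitmask with one bit per warp/wavefront lane that voted true.
#include <cstdint>
#include <cstdio>

extern "C" uint64_t ompx_ballot_sync(uint64_t mask, int pred); // illustrative declaration

void report_positive_lanes(const int *data, int n) {
#pragma omp target teams distribute parallel for map(to : data[0 : n])
  for (int i = 0; i < n; ++i) {
    uint64_t ballot = ompx_ballot_sync(~0ULL, data[i] > 0);
    if (i == 0)
      printf("lanes with positive values: 0x%llx\n",
             (unsigned long long)ballot);
  }
}
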
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp
index d07ac0fb499c..606e3bec0d33 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/Utils.cpp
@@ -37,6 +37,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
+uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+
/// AMDGCN Implementation
///
///{
@@ -57,6 +59,10 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
+uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
+ return Mask & __builtin_amdgcn_ballot_w64(Pred);
+}
+
bool isSharedMemPtr(const void *Ptr) {
return __builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void *)Ptr);
@@ -80,6 +86,10 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
+uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
+ return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
+}
+
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
#pragma omp end declare variant
@@ -103,6 +113,10 @@ int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
return impl::shuffleDown(Mask, Var, Delta, Width);
}
+uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) {
+ return impl::ballotSync(Mask, Pred);
+}
+
bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); }
extern "C" {
diff --git a/offload/cmake/Modules/LibomptargetGetDependencies.cmake b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
index e37b86b2a81f..c296f7ea3863 100644
--- a/offload/cmake/Modules/LibomptargetGetDependencies.cmake
+++ b/offload/cmake/Modules/LibomptargetGetDependencies.cmake
@@ -3,7 +3,6 @@
#
# libffi : required to launch target kernels given function and argument
# pointers.
-# CUDA : required to control offloading to NVIDIA GPUs.
include (FindPackageHandleStandardArgs)
@@ -44,13 +43,6 @@ find_package(FFI QUIET)
set(LIBOMPTARGET_DEP_LIBFFI_FOUND ${FFI_FOUND})
################################################################################
-# Looking for CUDA...
-################################################################################
-
-find_package(CUDAToolkit QUIET)
-set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDAToolkit_FOUND})
-
-################################################################################
# Looking for NVIDIA GPUs...
################################################################################
set(LIBOMPTARGET_DEP_CUDA_ARCH "sm_35")
diff --git a/offload/plugins-nextgen/amdgpu/CMakeLists.txt b/offload/plugins-nextgen/amdgpu/CMakeLists.txt
index 2f4057c0ae7e..47cd2feefc72 100644
--- a/offload/plugins-nextgen/amdgpu/CMakeLists.txt
+++ b/offload/plugins-nextgen/amdgpu/CMakeLists.txt
@@ -1,11 +1,6 @@
# As of rocm-3.7, hsa is installed with cmake packages and kmt is found via hsa
find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm)
-if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
- message(STATUS "Not building AMDGPU NextGen plugin: only support AMDGPU in Linux x86_64, ppc64le, or aarch64 hosts")
- return()
-endif()
-
# Create the library and add the default arguments.
add_target_library(omptarget.rtl.amdgpu AMDGPU)
@@ -13,8 +8,7 @@ target_sources(omptarget.rtl.amdgpu PRIVATE src/rtl.cpp)
target_include_directories(omptarget.rtl.amdgpu PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/utils)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBHSA "Build with dlopened libhsa" OFF)
-if(hsa-runtime64_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBHSA)
+if(hsa-runtime64_FOUND AND NOT "amdgpu" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
message(STATUS "Building AMDGPU plugin linked against libhsa")
target_link_libraries(omptarget.rtl.amdgpu PRIVATE hsa-runtime64::hsa-runtime64)
else()
diff --git a/offload/plugins-nextgen/common/include/JIT.h b/offload/plugins-nextgen/common/include/JIT.h
index b22197b89208..4414926a6178 100644
--- a/offload/plugins-nextgen/common/include/JIT.h
+++ b/offload/plugins-nextgen/common/include/JIT.h
@@ -55,10 +55,6 @@ struct JITEngine {
process(const __tgt_device_image &Image,
target::plugin::GenericDeviceTy &Device);
- /// Return true if \p Image is a bitcode image that can be JITed for the given
- /// architecture.
- Expected<bool> checkBitcodeImage(StringRef Buffer) const;
-
private:
/// Compile the bitcode image \p Image and generate the binary image that can
/// be loaded to the target device of the triple \p Triple architecture \p
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 83f6e8d76fec..eda6a4fd541e 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1052,6 +1052,10 @@ struct GenericPluginTy {
/// given target. Returns true if the \p Image is compatible with the plugin.
Expected<bool> checkELFImage(StringRef Image) const;
+ /// Return true if the \p Image can be compiled to run on the platform's
+ /// target architecture.
+ Expected<bool> checkBitcodeImage(StringRef Image) const;
+
/// Indicate if an image is compatible with the plugin devices. Notice that
/// this function may be called before actually initializing the devices. So
/// we could not move this function into GenericDeviceTy.
@@ -1066,8 +1070,11 @@ protected:
public:
// TODO: This plugin interface needs to be cleaned up.
+ /// Returns true if the plugin has been initialized.
+ int32_t is_initialized() const;
+
/// Returns non-zero if the provided \p Image can be executed by the runtime.
- int32_t is_valid_binary(__tgt_device_image *Image);
+ int32_t is_valid_binary(__tgt_device_image *Image, bool Initialized = true);
/// Initialize the device inside of the plugin.
int32_t init_device(int32_t DeviceId);
@@ -1187,6 +1194,9 @@ public:
void **KernelPtr);
private:
+ /// Indicates if the platform runtime has been fully initialized.
+ bool Initialized = false;
+
/// Number of devices available for the plugin.
int32_t NumDevices = 0;
diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp
index 9d58e6060646..9dbba1459839 100644
--- a/offload/plugins-nextgen/common/src/JIT.cpp
+++ b/offload/plugins-nextgen/common/src/JIT.cpp
@@ -323,19 +323,3 @@ JITEngine::process(const __tgt_device_image &Image,
return &Image;
}
-
-Expected<bool> JITEngine::checkBitcodeImage(StringRef Buffer) const {
- TimeTraceScope TimeScope("Check bitcode image");
-
- assert(identify_magic(Buffer) == file_magic::bitcode &&
- "Input is not bitcode");
-
- LLVMContext Context;
- auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Buffer, ""), Context,
- /*ShouldLazyLoadMetadata=*/true);
- if (!ModuleOrErr)
- return ModuleOrErr.takeError();
- Module &M = **ModuleOrErr;
-
- return Triple(M.getTargetTriple()).getArch() == TT.getArch();
-}
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 550ebc9c28b2..913721a15d71 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -24,6 +24,7 @@
#include "omp-tools.h"
#endif
+#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/JSON.h"
@@ -1495,6 +1496,7 @@ Error GenericPluginTy::init() {
if (!NumDevicesOrErr)
return NumDevicesOrErr.takeError();
+ Initialized = true;
NumDevices = *NumDevicesOrErr;
if (NumDevices == 0)
return Plugin::success();
@@ -1578,14 +1580,27 @@ Expected<bool> GenericPluginTy::checkELFImage(StringRef Image) const {
if (!MachineOrErr)
return MachineOrErr.takeError();
- if (!*MachineOrErr)
+ return MachineOrErr;
+}
+
+Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const {
+ if (identify_magic(Image) != file_magic::bitcode)
return false;
- // Perform plugin-dependent checks for the specific architecture if needed.
- return isELFCompatible(Image);
+ LLVMContext Context;
+ auto ModuleOrErr = getLazyBitcodeModule(MemoryBufferRef(Image, ""), Context,
+ /*ShouldLazyLoadMetadata=*/true);
+ if (!ModuleOrErr)
+ return ModuleOrErr.takeError();
+ Module &M = **ModuleOrErr;
+
+ return Triple(M.getTargetTriple()).getArch() == getTripleArch();
}
-int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image) {
+int32_t GenericPluginTy::is_initialized() const { return Initialized; }
+
+int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image,
+ bool Initialized) {
StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart),
target::getPtrDiff(Image->ImageEnd, Image->ImageStart));
@@ -1603,10 +1618,17 @@ int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image) {
auto MatchOrErr = checkELFImage(Buffer);
if (Error Err = MatchOrErr.takeError())
return HandleError(std::move(Err));
- return *MatchOrErr;
+ if (!Initialized || !*MatchOrErr)
+ return *MatchOrErr;
+
+ // Perform plugin-dependent checks for the specific architecture if needed.
+ auto CompatibleOrErr = isELFCompatible(Buffer);
+ if (Error Err = CompatibleOrErr.takeError())
+ return HandleError(std::move(Err));
+ return *CompatibleOrErr;
}
case file_magic::bitcode: {
- auto MatchOrErr = getJIT().checkBitcodeImage(Buffer);
+ auto MatchOrErr = checkBitcodeImage(Buffer);
if (Error Err = MatchOrErr.takeError())
return HandleError(std::move(Err));
return *MatchOrErr;
diff --git a/offload/plugins-nextgen/cuda/CMakeLists.txt b/offload/plugins-nextgen/cuda/CMakeLists.txt
index 10ff612848ad..5fdfb8f9cf62 100644
--- a/offload/plugins-nextgen/cuda/CMakeLists.txt
+++ b/offload/plugins-nextgen/cuda/CMakeLists.txt
@@ -1,17 +1,10 @@
-if (NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
- message(STATUS "Not building CUDA NextGen offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")
- return()
-endif()
-
-message(STATUS "Building CUDA NextGen offloading plugin.")
-
# Create the library and add the default arguments.
add_target_library(omptarget.rtl.cuda CUDA)
target_sources(omptarget.rtl.cuda PRIVATE src/rtl.cpp)
-option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" OFF)
-if(LIBOMPTARGET_DEP_CUDA_FOUND AND NOT LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND AND NOT "cuda" IN_LIST LIBOMPTARGET_DLOPEN_PLUGINS)
message(STATUS "Building CUDA plugin linked against libcuda")
target_link_libraries(omptarget.rtl.cuda PRIVATE CUDA::cuda_driver)
else()
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 32031c28f879..d65e5cf61e09 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -16,6 +16,15 @@
#include <cstddef>
#include <cstdint>
+#define cuDeviceTotalMem cuDeviceTotalMem_v2
+#define cuModuleGetGlobal cuModuleGetGlobal_v2
+#define cuMemGetInfo cuMemGetInfo_v2
+#define cuMemAlloc cuMemAlloc_v2
+#define cuMemFree cuMemFree_v2
+#define cuMemAllocHost cuMemAllocHost_v2
+#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
+
typedef int CUdevice;
typedef uintptr_t CUdeviceptr;
typedef struct CUmod_st *CUmodule;
diff --git a/offload/plugins-nextgen/host/CMakeLists.txt b/offload/plugins-nextgen/host/CMakeLists.txt
index 9c6aa274921b..817d128f9241 100644
--- a/offload/plugins-nextgen/host/CMakeLists.txt
+++ b/offload/plugins-nextgen/host/CMakeLists.txt
@@ -1,7 +1,3 @@
-if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
- return()
-endif()
-
set(supported_targets x86_64 aarch64 ppc64 ppc64le s390x)
if(NOT ${CMAKE_SYSTEM_PROCESSOR} IN_LIST supported_targets)
message(STATUS "Not building ${machine} NextGen offloading plugin")
diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp
index 191afa345641..f72007849e36 100644
--- a/offload/src/PluginManager.cpp
+++ b/offload/src/PluginManager.cpp
@@ -34,15 +34,8 @@ void PluginManager::init() {
// Attempt to create an instance of each supported plugin.
#define PLUGIN_TARGET(Name) \
do { \
- auto Plugin = std::unique_ptr<GenericPluginTy>(createPlugin_##Name()); \
- if (auto Err = Plugin->init()) { \
- [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); \
- DP("Failed to init plugin: %s\n", InfoMsg.c_str()); \
- } else { \
- DP("Registered plugin %s with %d visible device(s)\n", \
- Plugin->getName(), Plugin->number_of_devices()); \
- Plugins.emplace_back(std::move(Plugin)); \
- } \
+ Plugins.emplace_back( \
+ std::unique_ptr<GenericPluginTy>(createPlugin_##Name())); \
} while (false);
#include "Shared/Targets.def"
@@ -160,6 +153,27 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
if (Entry.flags == OMP_REGISTER_REQUIRES)
PM->addRequirements(Entry.data);
+ // Initialize all the plugins that have associated images.
+ for (auto &Plugin : Plugins) {
+ if (Plugin->is_initialized())
+ continue;
+
+    // Extract the executable image and extra information if available.
+ for (int32_t i = 0; i < Desc->NumDeviceImages; ++i) {
+ if (!Plugin->is_valid_binary(&Desc->DeviceImages[i],
+ /*Initialized=*/false))
+ continue;
+
+ if (auto Err = Plugin->init()) {
+ [[maybe_unused]] std::string InfoMsg = toString(std::move(Err));
+ DP("Failed to init plugin: %s\n", InfoMsg.c_str());
+ } else {
+ DP("Registered plugin %s with %d visible device(s)\n",
+ Plugin->getName(), Plugin->number_of_devices());
+ }
+ }
+ }
+
// Extract the exectuable image and extra information if availible.
for (int32_t i = 0; i < Desc->NumDeviceImages; ++i)
PM->addDeviceImage(*Desc, Desc->DeviceImages[i]);
@@ -177,7 +191,7 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
if (!R.number_of_devices())
continue;
- if (!R.is_valid_binary(Img)) {
+ if (!R.is_valid_binary(Img, /*Initialized=*/true)) {
DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
DPxPTR(Img->ImageStart), R.getName());
continue;
diff --git a/offload/test/offloading/dynamic_module.c b/offload/test/offloading/dynamic_module.c
index f1e9862002a1..9dcf3a1ae649 100644
--- a/offload/test/offloading/dynamic_module.c
+++ b/offload/test/offloading/dynamic_module.c
@@ -2,6 +2,8 @@
// RUN: %libomptarget-compile-generic %t.so && %libomptarget-run-generic 2>&1 | %fcheck-generic
// RUN: %libomptarget-compileopt-generic -DSHARED -fPIC -shared -o %t.so && \
// RUN: %libomptarget-compileopt-generic %t.so && %libomptarget-run-generic 2>&1 | %fcheck-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
#ifdef SHARED
void foo() {}
diff --git a/offload/test/offloading/fortran/dump_map_tables.f90 b/offload/test/offloading/fortran/dump_map_tables.f90
new file mode 100644
index 000000000000..cb66ef348e3c
--- /dev/null
+++ b/offload/test/offloading/fortran/dump_map_tables.f90
@@ -0,0 +1,38 @@
+! Offloading test with a runtime call to ompx_dump_mapping_tables. A Fortran
+! array writes some values, the mapping table is printed, and the variable
+! mapped to the device correctly receives the updates made on the device.
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+
+program map_dump_example
+ INTERFACE
+ SUBROUTINE ompx_dump_mapping_tables() BIND(C)
+ END SUBROUTINE ompx_dump_mapping_tables
+ END INTERFACE
+
+ integer i,j,k,N
+ integer async_q(4)
+ real :: A(5000000)
+ N=5000000
+ do i=1, N
+ A(i)=0
+ enddo
+! clang-format off
+! CHECK: omptarget device 0 info: OpenMP Host-Device pointer mappings after block
+! CHECK-NEXT: omptarget device 0 info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration
+! CHECK-NEXT: omptarget device 0 info: {{(0x[0-9a-f]{16})}} {{(0x[0-9a-f]{16})}} 20000000 1 0 {{.*}} at a(:n):21:11
+! clang-format on
+!$omp target enter data map(to:A(:N))
+ call ompx_dump_mapping_tables()
+!$omp target parallel do
+ do i=1, N
+ A(i)=A(i)*2
+ enddo
+!$omp target exit data map(from:A)
+end program
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
new file mode 100644
index 000000000000..d8e17691bf9c
--- /dev/null
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -0,0 +1,45 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
+#define MASK 0xaaaaaaaaaaaaaaaa
+#else
+#define MASK 0xaaaaaaaa
+#endif
+
+#include <assert.h>
+#include <ompx.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) {
+ const int num_blocks = 1;
+ const int block_size = 256;
+ const int N = num_blocks * block_size;
+ uint64_t *data = (uint64_t *)malloc(N * sizeof(uint64_t));
+
+ for (int i = 0; i < N; ++i)
+ data[i] = i & 0x1;
+
+#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(tofrom: data[0:N])
+ {
+ int tid = ompx_thread_id_x();
+ uint64_t mask = ompx_ballot_sync(~0U, data[tid]);
+ data[tid] += mask;
+ }
+
+ for (int i = 0; i < N; ++i)
+ assert(data[i] == ((i & 0x1) + MASK));
+
+ // CHECK: PASS
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/openmp/cmake/OpenMPTesting.cmake b/openmp/cmake/OpenMPTesting.cmake
index ab2348ae59b5..c67ad8b1cbd9 100644
--- a/openmp/cmake/OpenMPTesting.cmake
+++ b/openmp/cmake/OpenMPTesting.cmake
@@ -58,7 +58,7 @@ if (${OPENMP_STANDALONE_BUILD})
set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --no-progress-bar")
endif()
if (${CMAKE_SYSTEM_NAME} MATCHES "AIX")
- set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --time-tests --timeout=1800")
+ set(DEFAULT_LIT_ARGS "${DEFAULT_LIT_ARGS} --time-tests --timeout=3000")
endif()
set(OPENMP_LIT_ARGS "${DEFAULT_LIT_ARGS}" CACHE STRING "Options for lit.")
separate_arguments(OPENMP_LIT_ARGS)
diff --git a/openmp/docs/SupportAndFAQ.rst b/openmp/docs/SupportAndFAQ.rst
index 9e6974dfbb13..a158422befd0 100644
--- a/openmp/docs/SupportAndFAQ.rst
+++ b/openmp/docs/SupportAndFAQ.rst
@@ -454,6 +454,15 @@ Q: What command line options can I use for OpenMP?
We recommend taking a look at the OpenMP
:doc:`command line argument reference <CommandLineArgumentReference>` page.
+Q: Can I build the offloading runtimes without CUDA or HSA?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+By default, the offloading runtime will load the associated vendor runtime at
+initialization rather than linking against it directly. This allows the
+program to be built and run on many machines. If you wish to link directly
+against these libraries, set ``LIBOMPTARGET_DLOPEN_PLUGINS=""`` to suppress
+this behavior for every plugin. The default value is the list of plugins
+enabled with ``LIBOMPTARGET_PLUGINS_TO_BUILD``.
+
Q: Why is my build taking a long time?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
When installing OpenMP and other LLVM components, the build time on multicore
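[Editor's note: a minimal sketch of the configure step the FAQ answer above describes. The source path and the plugin selection are hypothetical; only the two cache variable names are taken from the text, and the runtimes-build invocation is an assumption.]

    cmake <llvm-project>/runtimes -DLLVM_ENABLE_RUNTIMES="openmp;offload" \
      -DLIBOMPTARGET_PLUGINS_TO_BUILD="cuda;amdgpu;host" \
      -DLIBOMPTARGET_DLOPEN_PLUGINS=""  # empty list: link every plugin directly against its vendor runtime
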
diff --git a/openmp/docs/remarks/OMP121.rst b/openmp/docs/remarks/OMP121.rst
index 88561b8a1fe1..f3ceeac7f3ab 100644
--- a/openmp/docs/remarks/OMP121.rst
+++ b/openmp/docs/remarks/OMP121.rst
@@ -1,6 +1,6 @@
.. _omp121:
-Value has potential side effects preventing SPMD-mode execution. Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to the called function to override. [OMP121]
+Value has potential side effects preventing SPMD-mode execution. Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to the called function to override. [OMP121]
===================================================================================================================================================================
This analysis remark indicates that a potential side-effect that cannot be
@@ -42,7 +42,7 @@ or operations that cannot be executed in SPMD-mode.
$ clang++ -fopenmp -fopenmp-targets=nvptx64 -O2 -Rpass-analysis=openmp-opt omp121.cpp
omp121.cpp:8:13: remark: Value has potential side effects preventing SPMD-mode
- execution. Add `__attribute__((assume("ompx_spmd_amenable")))` to the called function
+ execution. Add `[[omp::assume("ompx_spmd_amenable")]]` to the called function
to override. [OMP121]
int x = work();
^
@@ -53,7 +53,7 @@ contain any code that prevents SPMD-mode execution.
.. code-block:: c++
- __attribute__((assume("ompx_spmd_amenable"))) extern int work();
+ [[omp::assume("ompx_spmd_amenable")]] extern int work();
void use(int x);
diff --git a/openmp/docs/remarks/OMP133.rst b/openmp/docs/remarks/OMP133.rst
index f025352de105..5a734479d495 100644
--- a/openmp/docs/remarks/OMP133.rst
+++ b/openmp/docs/remarks/OMP133.rst
@@ -1,4 +1,4 @@
-Call may contain unknown parallel regions. Use `__attribute__((assume("omp_no_parallelism")))` to override. [OMP133]
+Call may contain unknown parallel regions. Use `[[omp::assume("omp_no_parallelism")]]` to override. [OMP133]
====================================================================================================================
.. _omp133:
@@ -33,7 +33,7 @@ regions. This is typically coupled with the :ref:`OMP132 <omp132>` remark.
$ clang++ -fopenmp -fopenmp-targets=nvptx64 -O2 -Rpass-analysis=openmp-opt omp133.cpp
omp133.cpp:6:5: remark: Call may contain unknown parallel regions. Use
- `__attribute__((assume("omp_no_parallelism")))` to override. [OMP133]
+ `[[omp::assume("omp_no_parallelism")]]` to override. [OMP133]
setup();
^
@@ -43,7 +43,7 @@ specialized state machine.
.. code-block:: c++
- __attribute__((assume("omp_no_parallelism"))) extern void setup();
+ [[omp::assume("omp_no_parallelism")]] extern void setup();
void foo() {
diff --git a/openmp/docs/remarks/OptimizationRemarks.rst b/openmp/docs/remarks/OptimizationRemarks.rst
index a29dce60e073..2c683a4376c4 100644
--- a/openmp/docs/remarks/OptimizationRemarks.rst
+++ b/openmp/docs/remarks/OptimizationRemarks.rst
@@ -81,7 +81,7 @@ OpenMP Remarks
* - :ref:`OMP121 <omp121>`
- Analysis
- Value has potential side effects preventing SPMD-mode execution. Add
- `__attribute__((assume(\"ompx_spmd_amenable\")))` to the called function
+ `[[omp::assume(\"ompx_spmd_amenable\")]]` to the called function
to override.
* - :ref:`OMP130 <omp130>`
- Optimization
@@ -96,7 +96,7 @@ OpenMP Remarks
* - :ref:`OMP133 <omp133>`
- Analysis
- Call may contain unknown parallel regions. Use
- `__attribute__((assume("omp_no_parallelism")))` to override.
+ `[[omp::assume("omp_no_parallelism")]]` to override.
* - :ref:`OMP140 <omp140>`
- Analysis
- Could not internalize function. Some optimizations may not be possible.
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 579d31aa98c5..19851880c3ac 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -9,6 +9,8 @@
#ifndef __OMPX_H
#define __OMPX_H
+typedef unsigned long uint64_t;
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -81,6 +83,10 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C
///}
+static inline uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
+ __builtin_trap();
+}
+
#pragma omp end declare variant
/// ompx_{sync_block}_{,divergent}
@@ -109,6 +115,8 @@ _TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim)
#undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C
///}
+uint64_t ompx_ballot_sync(uint64_t mask, int pred);
+
#ifdef __cplusplus
}
#endif
@@ -160,6 +168,10 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent,
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX
///}
+static inline uint64_t ballot_sync(uint64_t mask, int pred) {
+ return ompx_ballot_sync(mask, pred);
+}
+
} // namespace ompx
#endif
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index e8f7f3470580..14c746898213 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -171,10 +171,14 @@ config.substitutions.append(("%libomp-c99-compile-and-run", \
"%libomp-c99-compile && %libomp-run"))
config.substitutions.append(("%libomp-cxx-compile-and-run", \
"%libomp-cxx-compile && %libomp-run"))
+config.substitutions.append(("%libomp-cxx20-compile-and-run", \
+ "%libomp-cxx20-compile && %libomp-run"))
config.substitutions.append(("%libomp-cxx-compile-c", \
"%clangXX %openmp_flags %flags -std=c++17 -x c++ %s -o %t" + libs))
config.substitutions.append(("%libomp-cxx-compile", \
"%clangXX %openmp_flags %flags -std=c++17 %s -o %t" + libs))
+config.substitutions.append(("%libomp-cxx20-compile", \
+ "%clangXX %openmp_flags %flags -std=c++20 %s -o %t" + libs))
config.substitutions.append(("%libomp-compile", \
"%clang %openmp_flags %flags %s -o %t" + libs))
config.substitutions.append(("%libomp-irbuilder-compile", \
diff --git a/openmp/runtime/test/transform/tile/foreach.cpp b/openmp/runtime/test/transform/tile/foreach.cpp
new file mode 100644
index 000000000000..4fb359576097
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/foreach.cpp
@@ -0,0 +1,228 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp tile sizes(2, 2)
+ for (Reporter c{"C"}; auto &&v : Reporter("A"))
+ for (Reporter d{"D"}; auto &&w : Reporter("B"))
+ printf("v=%d w=%d\n", v, w);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: v=0 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: v=0 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: v=1 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: v=1 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: v=0 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: v=1 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: v=2 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: v=2 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: v=2 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/tile/iterfor.cpp b/openmp/runtime/test/transform/tile/iterfor.cpp
new file mode 100644
index 000000000000..12613544f6e5
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/iterfor.cpp
@@ -0,0 +1,233 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+      return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+ {
+ Reporter A("A"), B("B");
+#pragma omp tile sizes(2, 2)
+ for (auto it = A.begin(); it != A.end(); ++it)
+ for (auto jt = B.begin(); jt != B.end(); ++jt)
+ printf("i=%d j=%d\n", *it, *jt);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 000000000000..b1f4d98a52dd
--- /dev/null
+++ b/openmp/runtime/test/transform/tile/parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,366 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(3) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp tile sizes(2, 2)
+ for (Reporter c{"C"}; auto &&v : Reporter("A"))
+ for (Reporter d{"D"}; auto &&w : Reporter("B"))
+ printf("i=%d v=%d w=%d\n", i, v, w);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [C] ctor
+// CHECK-NEXT: [A] ctor
+// CHECK-NEXT: [A] end()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] begin()
+// CHECK-NEXT: [A] iterator distance: 3
+// CHECK-NEXT: [D] ctor
+// CHECK-NEXT: [B] ctor
+// CHECK-NEXT: [B] end()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] begin()
+// CHECK-NEXT: [B] iterator distance: 3
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 v=0 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 v=0 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 v=1 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 v=1 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 v=0 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 v=1 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=0 v=2 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=0 v=2 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=0 v=2 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 v=0 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 v=0 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 v=1 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 v=1 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 v=0 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 v=1 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=1 v=2 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=1 v=2 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=1 v=2 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 v=0 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 v=0 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 v=1 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 v=1 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 0
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 0
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 v=0 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 1
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 1
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 v=1 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 0
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 0
+// CHECK-NEXT: i=2 v=2 w=0
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator advance: 0 += 1
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 1
+// CHECK-NEXT: i=2 v=2 w=1
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator advance: 0 += 2
+// CHECK-NEXT: [A] iterator move assign
+// CHECK-NEXT: [A] iterator deref: 2
+// CHECK-NEXT: [B] iterator advance: 0 += 2
+// CHECK-NEXT: [B] iterator move assign
+// CHECK-NEXT: [B] iterator deref: 2
+// CHECK-NEXT: i=2 v=2 w=2
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] iterator dtor
+// CHECK-NEXT: [B] dtor
+// CHECK-NEXT: [D] dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] iterator dtor
+// CHECK-NEXT: [A] dtor
+// CHECK-NEXT: [C] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_foreach.cpp b/openmp/runtime/test/transform/unroll/factor_foreach.cpp
new file mode 100644
index 000000000000..29fef7c18736
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_foreach.cpp
@@ -0,0 +1,162 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp unroll partial(2)
+ for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+ printf("v=%d\n", v);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 0 != 3
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator 1 != 3
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator 2 != 3
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator 3 != 3
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_intfor.c b/openmp/runtime/test/transform/unroll/factor_intfor.c
new file mode 100644
index 000000000000..42ebeb48e41c
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp unroll partial(2)
+ for (int i = 7; i < 19; i += 3)
+ printf("i=%d\n", i);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=7
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=16
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_iterfor.cpp b/openmp/runtime/test/transform/unroll/factor_iterfor.cpp
new file mode 100644
index 000000000000..0298477110b2
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_iterfor.cpp
@@ -0,0 +1,169 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ print("iterator move ctor");
+ }
+
+ ~Iterator() { print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+ {
+ Reporter range("range");
+#pragma omp unroll partial(2)
+ for (auto it = range.begin(); it != range.end(); ++it)
+ printf("v=%d\n", *it);
+ }
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 0 != 3
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: v=0
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 1 != 3
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: v=1
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 2 != 3
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: v=2
+// CHECK-NEXT: [range] iterator prefix ++
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] iterator 3 != 3
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-foreach.cpp
new file mode 100644
index 000000000000..71567faf7964
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-foreach.cpp
@@ -0,0 +1,199 @@
+// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdarg>
+#include <cstdio>
+#include <vector>
+
+struct Reporter {
+ const char *name;
+
+ Reporter(const char *name) : name(name) { print("ctor"); }
+
+ Reporter() : name("<anon>") { print("ctor"); }
+
+ Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); }
+
+ Reporter(Reporter &&that) : name(that.name) { print("move ctor"); }
+
+ ~Reporter() { print("dtor"); }
+
+ const Reporter &operator=(const Reporter &that) {
+ print("copy assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ const Reporter &operator=(Reporter &&that) {
+ print("move assign");
+ this->name = that.name;
+ return *this;
+ }
+
+ struct Iterator {
+ const Reporter *owner;
+ int pos;
+
+ Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {}
+
+ Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator copy ctor");
+ }
+
+ Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) {
+ owner->print("iterator move ctor");
+ }
+
+ ~Iterator() { owner->print("iterator dtor"); }
+
+ const Iterator &operator=(const Iterator &that) {
+ owner->print("iterator copy assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ const Iterator &operator=(Iterator &&that) {
+ owner->print("iterator move assign");
+ this->owner = that.owner;
+ this->pos = that.pos;
+ return *this;
+ }
+
+ bool operator==(const Iterator &that) const {
+ owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos);
+ return this->pos == that.pos;
+ }
+
+ bool operator!=(const Iterator &that) const {
+ owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos);
+ return this->pos != that.pos;
+ }
+
+ Iterator &operator++() {
+ owner->print("iterator prefix ++");
+ pos -= 1;
+ return *this;
+ }
+
+ Iterator operator++(int) {
+ owner->print("iterator postfix ++");
+ auto result = *this;
+ pos -= 1;
+ return result;
+ }
+
+ int operator*() const {
+ int result = 2 - pos;
+ owner->print("iterator deref: %i", result);
+ return result;
+ }
+
+ size_t operator-(const Iterator &that) const {
+ int result = (2 - this->pos) - (2 - that.pos);
+ owner->print("iterator distance: %d", result);
+ return result;
+ }
+
+ Iterator operator+(int steps) const {
+ owner->print("iterator advance: %i += %i", 2 - this->pos, steps);
+ return Iterator(owner, pos - steps);
+ }
+
+ void print(const char *msg) const { owner->print(msg); }
+ };
+
+ Iterator begin() const {
+ print("begin()");
+ return Iterator(this, 2);
+ }
+
+ Iterator end() const {
+ print("end()");
+ return Iterator(this, -1);
+ }
+
+ void print(const char *msg, ...) const {
+ va_list args;
+ va_start(args, msg);
+ printf("[%s] ", name);
+ vprintf(msg, args);
+ printf("\n");
+ va_end(args);
+ }
+};
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp unroll partial(2)
+ for (Reporter c{"init-stmt"}; auto &&v : Reporter("range"))
+ printf("i=%d v=%d\n", i, v);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: [init-stmt] ctor
+// CHECK-NEXT: [range] ctor
+// CHECK-NEXT: [range] end()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] begin()
+// CHECK-NEXT: [range] iterator distance: 3
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=0 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=0 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=0 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=1 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=1 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=1 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 0
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 0
+// CHECK-NEXT: i=2 v=0
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 1
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 1
+// CHECK-NEXT: i=2 v=1
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator advance: 0 += 2
+// CHECK-NEXT: [range] iterator move assign
+// CHECK-NEXT: [range] iterator deref: 2
+// CHECK-NEXT: i=2 v=2
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] iterator dtor
+// CHECK-NEXT: [range] dtor
+// CHECK-NEXT: [init-stmt] dtor
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-intfor.cpp b/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-intfor.cpp
new file mode 100644
index 000000000000..0a31f8db0701
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/factor_parallel-wsloop-collapse-intfor.cpp
@@ -0,0 +1,32 @@
+// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <cstdlib>
+#include <cstdio>
+
+int main() {
+ printf("do\n");
+#pragma omp parallel for collapse(2) num_threads(1)
+ for (int i = 0; i < 3; ++i)
+#pragma omp unroll partial(2)
+ for (int j = 0; j < 3; ++j)
+ printf("i=%d j=%d\n", i, j);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=0 j=0
+// CHECK-NEXT: i=0 j=1
+// CHECK-NEXT: i=0 j=2
+// CHECK-NEXT: i=1 j=0
+// CHECK-NEXT: i=1 j=1
+// CHECK-NEXT: i=1 j=2
+// CHECK-NEXT: i=2 j=0
+// CHECK-NEXT: i=2 j=1
+// CHECK-NEXT: i=2 j=2
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/full_intfor.c b/openmp/runtime/test/transform/unroll/full_intfor.c
new file mode 100644
index 000000000000..081451109176
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/full_intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp unroll full
+ for (int i = 7; i < 19; i += 3)
+ printf("i=%d\n", i);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=7
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=16
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/heuristic_intfor.c b/openmp/runtime/test/transform/unroll/heuristic_intfor.c
new file mode 100644
index 000000000000..b07bec7d82f0
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/heuristic_intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp unroll
+ for (int i = 7; i < 19; i += 3)
+ printf("i=%d\n", i);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=7
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=16
+// CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/unroll/partial_intfor.c b/openmp/runtime/test/transform/unroll/partial_intfor.c
new file mode 100644
index 000000000000..2ede94e70e12
--- /dev/null
+++ b/openmp/runtime/test/transform/unroll/partial_intfor.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines
+
+#ifndef HEADER
+#define HEADER
+
+#include <stdlib.h>
+#include <stdio.h>
+
+int main() {
+ printf("do\n");
+#pragma omp unroll partial
+ for (int i = 7; i < 19; i += 3)
+ printf("i=%d\n", i);
+ printf("done\n");
+ return EXIT_SUCCESS;
+}
+
+#endif /* HEADER */
+
+// CHECK: do
+// CHECK-NEXT: i=7
+// CHECK-NEXT: i=10
+// CHECK-NEXT: i=13
+// CHECK-NEXT: i=16
+// CHECK-NEXT: done
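+
The three plain-C tests above (full, heuristic, and partial unroll) run the same loop and expect identical output, because unrolling only restructures the loop body; it never changes which iterations execute. As a hand-written sketch, assuming an unroll factor of 2 purely for illustration (the partial clause without an argument leaves the factor to the implementation), the partially unrolled loop is roughly equivalent to:

#include <stdio.h>

int main(void) {
  int i = 7;
  /* Main unrolled loop: two original iterations per trip while both fit. */
  for (; i + 3 < 19; i += 6) {
    printf("i=%d\n", i);
    printf("i=%d\n", i + 3);
  }
  /* Remainder loop for leftover iterations (none here, since the four   */
  /* iterations 7, 10, 13, 16 pair up evenly).                            */
  for (; i < 19; i += 3)
    printf("i=%d\n", i);
  return 0;
}

Either form prints i=7, 10, 13, 16, matching the CHECK lines shared by all three tests.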
diff --git a/polly/test/CodeGen/20100617.ll b/polly/test/CodeGen/20100617.ll
index 71a889f067b8..7229a6e3d524 100644
--- a/polly/test/CodeGen/20100617.ll
+++ b/polly/test/CodeGen/20100617.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @init_array() nounwind {
diff --git a/polly/test/CodeGen/20100622.ll b/polly/test/CodeGen/20100622.ll
index 872d6a0d75cf..bed737741abb 100644
--- a/polly/test/CodeGen/20100622.ll
+++ b/polly/test/CodeGen/20100622.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | not FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s | not FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/polly/test/CodeGen/20100707.ll b/polly/test/CodeGen/20100707.ll
index 338198084fc7..ee0422e07c4e 100644
--- a/polly/test/CodeGen/20100707.ll
+++ b/polly/test/CodeGen/20100707.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @clause_SetSplitField(i32 %Length) nounwind inlinehint {
diff --git a/polly/test/CodeGen/20100707_2.ll b/polly/test/CodeGen/20100707_2.ll
index df784c6d7957..a4cd76af9dd3 100644
--- a/polly/test/CodeGen/20100707_2.ll
+++ b/polly/test/CodeGen/20100707_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@win193 = external global [4 x [36 x double]], align 32 ; <ptr> [#uses=3]
diff --git a/polly/test/CodeGen/20100708.ll b/polly/test/CodeGen/20100708.ll
index 50b8e385df53..9080451aeae5 100644
--- a/polly/test/CodeGen/20100708.ll
+++ b/polly/test/CodeGen/20100708.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define fastcc void @execute() nounwind {
diff --git a/polly/test/CodeGen/20100708_2.ll b/polly/test/CodeGen/20100708_2.ll
index 2f4807d9e4d7..51dc9d311f07 100644
--- a/polly/test/CodeGen/20100708_2.ll
+++ b/polly/test/CodeGen/20100708_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @init_array() nounwind {
diff --git a/polly/test/CodeGen/20100713.ll b/polly/test/CodeGen/20100713.ll
index edd352a4c4cc..a836795c9907 100644
--- a/polly/test/CodeGen/20100713.ll
+++ b/polly/test/CodeGen/20100713.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @fft_float(i32 %NumSamples) nounwind {
diff --git a/polly/test/CodeGen/20100713_2.ll b/polly/test/CodeGen/20100713_2.ll
index 92f8959d91d6..28b984bd5900 100644
--- a/polly/test/CodeGen/20100713_2.ll
+++ b/polly/test/CodeGen/20100713_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define hidden void @luaD_callhook() nounwind {
diff --git a/polly/test/CodeGen/20100717.ll b/polly/test/CodeGen/20100717.ll
index a400eeaa3370..51c453cfe438 100644
--- a/polly/test/CodeGen/20100717.ll
+++ b/polly/test/CodeGen/20100717.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @matrixTranspose(ptr %A) nounwind {
diff --git a/polly/test/CodeGen/20100718-DomInfo-2.ll b/polly/test/CodeGen/20100718-DomInfo-2.ll
index 512b4c5c99af..fdac75f1b999 100644
--- a/polly/test/CodeGen/20100718-DomInfo-2.ll
+++ b/polly/test/CodeGen/20100718-DomInfo-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @getNonAffNeighbour() nounwind {
diff --git a/polly/test/CodeGen/20100718-DomInfo.ll b/polly/test/CodeGen/20100718-DomInfo.ll
index e12334359c33..da68eb0dd8fa 100644
--- a/polly/test/CodeGen/20100718-DomInfo.ll
+++ b/polly/test/CodeGen/20100718-DomInfo.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @intrapred_luma_16x16(i32 %predmode) nounwind {
diff --git a/polly/test/CodeGen/20100720-MultipleConditions.ll b/polly/test/CodeGen/20100720-MultipleConditions.ll
index 9f2268713853..3dece4efdcd0 100644
--- a/polly/test/CodeGen/20100720-MultipleConditions.ll
+++ b/polly/test/CodeGen/20100720-MultipleConditions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ast -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s
;int bar1();
;int bar2();
diff --git a/polly/test/CodeGen/20100809-IndependentBlock.ll b/polly/test/CodeGen/20100809-IndependentBlock.ll
index 8d596689d8ae..f45b6544464d 100644
--- a/polly/test/CodeGen/20100809-IndependentBlock.ll
+++ b/polly/test/CodeGen/20100809-IndependentBlock.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @cfft2(ptr %x) nounwind {
entry:
diff --git a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
index 261a205560b5..82da9d248642 100644
--- a/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
+++ b/polly/test/CodeGen/20100811-ScalarDependencyBetweenBrAndCnd.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/CodeGen/20101030-Overflow.ll b/polly/test/CodeGen/20101030-Overflow.ll
index caaa4851f93e..fecdb9d4fed1 100644
--- a/polly/test/CodeGen/20101030-Overflow.ll
+++ b/polly/test/CodeGen/20101030-Overflow.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @compdecomp() nounwind {
diff --git a/polly/test/CodeGen/20101103-Overflow3.ll b/polly/test/CodeGen/20101103-Overflow3.ll
index b2faf14fba0b..f1503e25fcc4 100644
--- a/polly/test/CodeGen/20101103-Overflow3.ll
+++ b/polly/test/CodeGen/20101103-Overflow3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @Reflection_coefficients(ptr %r) nounwind {
bb20:
diff --git a/polly/test/CodeGen/20101103-signmissmatch.ll b/polly/test/CodeGen/20101103-signmissmatch.ll
index e157d292dc8a..3d0c929446f4 100644
--- a/polly/test/CodeGen/20101103-signmissmatch.ll
+++ b/polly/test/CodeGen/20101103-signmissmatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @CleanNet() nounwind {
diff --git a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
index c792d8c3d0bf..0e62e678f0ae 100644
--- a/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
+++ b/polly/test/CodeGen/20110226-Ignore-Dead-Code.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @main() nounwind {
diff --git a/polly/test/CodeGen/20110226-PHI-Node-removed.ll b/polly/test/CodeGen/20110226-PHI-Node-removed.ll
index 3458d75c47a0..32b018f24e54 100644
--- a/polly/test/CodeGen/20110226-PHI-Node-removed.ll
+++ b/polly/test/CodeGen/20110226-PHI-Node-removed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/CodeGen/20120316-InvalidCast.ll b/polly/test/CodeGen/20120316-InvalidCast.ll
index 8355cc51c468..b87a3dc60dea 100644
--- a/polly/test/CodeGen/20120316-InvalidCast.ll
+++ b/polly/test/CodeGen/20120316-InvalidCast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
; CHECK: polly.start
diff --git a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
index 1d629e388452..dac78bf04a25 100644
--- a/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
+++ b/polly/test/CodeGen/20120403-RHS-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
; We just check that this compilation does not crash.
diff --git a/polly/test/CodeGen/20130221.ll b/polly/test/CodeGen/20130221.ll
index 45414671081a..5728a768a3b3 100644
--- a/polly/test/CodeGen/20130221.ll
+++ b/polly/test/CodeGen/20130221.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
define void @list_sequence(ptr %A) {
diff --git a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
index d54be5c3f35f..cafd68e50825 100644
--- a/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
+++ b/polly/test/CodeGen/20150328-SCEVExpanderIntroducesNewIV.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/Intrinsics/llvm-expect.ll b/polly/test/CodeGen/Intrinsics/llvm-expect.ll
index 84057e276521..47fd4f07e467 100644
--- a/polly/test/CodeGen/Intrinsics/llvm-expect.ll
+++ b/polly/test/CodeGen/Intrinsics/llvm-expect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; Check that we generate code without crashing.
;
diff --git a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
index b04319550938..28531244421d 100644
--- a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
+++ b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll
@@ -1,6 +1,6 @@
; This test checks that we do not accidently mutate the debug info when
; inserting loop parallel metadata.
-; RUN: opt %loadPolly < %s -S -polly -polly-codegen -polly-ast-detect-parallel | FileCheck %s
+; RUN: opt %loadNPMPolly < %s -S -polly -passes=polly-codegen -polly-ast-detect-parallel | FileCheck %s
; CHECK-NOT: !7 = !{!7}
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
index 7b131c5ebcbd..9bb086fa79ae 100644
--- a/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
+++ b/polly/test/CodeGen/LoopParallelMD/loop_nest_param_parallel.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s
;
; Check that we mark multiple parallel loops correctly including the memory instructions.
;
diff --git a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
index ec927acb1ec7..96b50cef179a 100644
--- a/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
+++ b/polly/test/CodeGen/LoopParallelMD/single_loop_param_parallel.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=SEQUENTIAL
-; RUN: opt %loadPolly -polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=SEQUENTIAL
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ast-detect-parallel -S < %s | FileCheck %s -check-prefix=PARALLEL
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; This is a trivially parallel loop. We just use it to ensure that we actually
diff --git a/polly/test/CodeGen/MemAccess/bad_alignment.ll b/polly/test/CodeGen/MemAccess/bad_alignment.ll
index 32f3cfe963b7..82fff27dd0eb 100644
--- a/polly/test/CodeGen/MemAccess/bad_alignment.ll
+++ b/polly/test/CodeGen/MemAccess/bad_alignment.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -disable-output 2>&1 < %s | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -disable-output 2>&1 < %s | FileCheck %s
;
; Check that we do not allow to access elements not accessed before because the
; alignment information would become invalid.
diff --git a/polly/test/CodeGen/MemAccess/codegen_address_space.ll b/polly/test/CodeGen/MemAccess/codegen_address_space.ll
index 7c9b12d64f9c..3ce363e8f09f 100644
--- a/polly/test/CodeGen/MemAccess/codegen_address_space.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_address_space.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen < %s -S | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
;int A[100];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
index e008a789fe7d..0563ca87eef5 100644
--- a/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_constant_offset.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen < %s -S | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
;int A[100];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple.ll b/polly/test/CodeGen/MemAccess/codegen_simple.ll
index 5ba6f3269fb9..ee0187fe97d2 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen < %s -S | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
;int A[100];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
index cf8913fc5197..6970565bf023 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_float.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen < %s -S | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed < %s -S | FileCheck %s
;
;float A[100];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
index e4afcc8d2243..f0896e2bf609 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_md.ll
@@ -1,5 +1,5 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed+withconst -polly-codegen < %s -S | FileCheck -check-prefix=WITHCONST %s
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed+withoutconst -polly-codegen < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
;int A[1040];
;
diff --git a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
index c9913f3ed873..99fc36996f08 100644
--- a/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
+++ b/polly/test/CodeGen/MemAccess/codegen_simple_md_float.ll
@@ -1,5 +1,5 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed+withconst -polly-codegen < %s -S | FileCheck -check-prefix=WITHCONST %s
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed+withoutconst -polly-codegen < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withconst < %s -S | FileCheck -check-prefix=WITHCONST %s
+;RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed+withoutconst < %s -S | FileCheck -check-prefix=WITHOUTCONST %s
;
;float A[1040];
;
diff --git a/polly/test/CodeGen/MemAccess/different_types.ll b/polly/test/CodeGen/MemAccess/different_types.ll
index 624de62911ff..53718194c25a 100644
--- a/polly/test/CodeGen/MemAccess/different_types.ll
+++ b/polly/test/CodeGen/MemAccess/different_types.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-import-jscop \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
; RUN: \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: -S < %s | FileCheck %s
;
; void foo(float A[], float B[]) {
; for (long i = 0; i < 100; i++)
diff --git a/polly/test/CodeGen/MemAccess/generate-all.ll b/polly/test/CodeGen/MemAccess/generate-all.ll
index 6f92ba13587e..d1f695d436da 100644
--- a/polly/test/CodeGen/MemAccess/generate-all.ll
+++ b/polly/test/CodeGen/MemAccess/generate-all.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-generate-expressions=false \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=false \
; RUN: -S < %s | FileCheck %s -check-prefix=SCEV
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-generate-expressions=true \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-generate-expressions=true \
; RUN: -S < %s | FileCheck %s -check-prefix=ASTEXPR
;
; void foo(float A[]) {
diff --git a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
index a6d1de0aac63..5c926ac63841 100644
--- a/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
+++ b/polly/test/CodeGen/MemAccess/invariant_base_ptr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-codegen -polly-invariant-load-hoisting -S \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-invariant-load-hoisting -S \
; RUN: 2>&1 < %s | FileCheck %s
; Setting new access functions where the base pointer of the array that is newly
diff --git a/polly/test/CodeGen/MemAccess/multiple_types.ll b/polly/test/CodeGen/MemAccess/multiple_types.ll
index 1793bd30fc5b..7848977ce031 100644
--- a/polly/test/CodeGen/MemAccess/multiple_types.ll
+++ b/polly/test/CodeGen/MemAccess/multiple_types.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' \
; RUN: -polly-allow-differing-element-types \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: -S < %s | FileCheck %s
;
; // Check that accessing one array with different types works.
; void multiple_types(char *Short, char *Float, char *Double) {
diff --git a/polly/test/CodeGen/MemAccess/simple.ll b/polly/test/CodeGen/MemAccess/simple.ll
index 39e8a2c91b79..5077e1a1b5a2 100644
--- a/polly/test/CodeGen/MemAccess/simple.ll
+++ b/polly/test/CodeGen/MemAccess/simple.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s
+;RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -stats < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;int A[100];
diff --git a/polly/test/CodeGen/MemAccess/update_access_functions.ll b/polly/test/CodeGen/MemAccess/update_access_functions.ll
index 05d208708a36..51fa97adb3c3 100644
--- a/polly/test/CodeGen/MemAccess/update_access_functions.ll
+++ b/polly/test/CodeGen/MemAccess/update_access_functions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-import-jscop-postfix=transformed -polly-codegen \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=transformed \
; RUN: < %s -S | FileCheck %s
; CHECK-LABEL: polly.stmt.loop1:
diff --git a/polly/test/CodeGen/OpenMP/alias-metadata.ll b/polly/test/CodeGen/OpenMP/alias-metadata.ll
index 07d79631b2cb..b80b18f43326 100644
--- a/polly/test/CodeGen/OpenMP/alias-metadata.ll
+++ b/polly/test/CodeGen/OpenMP/alias-metadata.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s
;
; void foo(float *A, float *B) {
; for (long i = 0; i < 1000; i++)
diff --git a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
index eb9dfcd9e920..9eb7f5f2a5e9 100644
--- a/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
+++ b/polly/test/CodeGen/OpenMP/floord-as-argument-to-subfunction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-opt-max-coefficient=-1 -polly-parallel -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,polly-codegen' -polly-opt-max-coefficient=-1 -polly-parallel -S < %s | FileCheck %s
;
; Check that we do not crash but generate parallel code
;
diff --git a/polly/test/CodeGen/OpenMP/inlineasm.ll b/polly/test/CodeGen/OpenMP/inlineasm.ll
index 69b1b0aa53f3..82a73780886e 100644
--- a/polly/test/CodeGen/OpenMP/inlineasm.ll
+++ b/polly/test/CodeGen/OpenMP/inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-parallel -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,polly-codegen' -polly-parallel -S < %s | FileCheck %s
; llvm.org/PR51960
; CHECK-LABEL: define internal void @foo_polly_subfn
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
index 30beef5b0709..b4c61d197b42 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
index fe5d2ab8c96d..8cf6148a7b44 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_different_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
index 49b9321c40b8..823e5cab55ab 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointer_preloaded_pass_only_needed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction but
diff --git a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
index 06c4cdab45f1..5557839e715e 100644
--- a/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
+++ b/polly/test/CodeGen/OpenMP/invariant_base_pointers_preloaded.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we hand down the preloaded A[0] to the OpenMP subfunction.
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
index db58c3ab7593..a987fac31b74 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-iv.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; This code has failed the scev based code generation as the scev in the scop
; contains an AddRecExpr of an outer loop. When generating code, we did not
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
index c2ddc1e26496..b81e120f8c22 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; AST: #pragma simd
; AST: #pragma omp parallel for
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
index 0f025bb94112..c4ad665c7b6c 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-parallel -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; The interesting part of this test case is the instruction:
; %tmp = bitcast i8* %call to i64**
diff --git a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
index f9612d77533d..07aae42335b6 100644
--- a/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
+++ b/polly/test/CodeGen/OpenMP/loop-body-references-outer-values.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR
; Make sure we correctly forward the reference to 'A' to the OpenMP subfunction.
;
diff --git a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
index da9da18c89b2..27e1bdd2dfbd 100644
--- a/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
+++ b/polly/test/CodeGen/OpenMP/loop-bounds-reference-outer-ids.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-codegen -S < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=IR
;
; float A[100];
;
diff --git a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
index 1b8433693abf..ac78b4e6c0c5 100644
--- a/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
+++ b/polly/test/CodeGen/OpenMP/mapped-phi-access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-parallel -polly-delicm -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-parallel '-passes=polly-delicm,polly-codegen' -S < %s | FileCheck %s
;
; Verify that -polly-parallel can handle mapped scalar MemoryAccesses.
;
diff --git a/polly/test/CodeGen/OpenMP/matmul-parallel.ll b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
index 5ee9a7c7a824..43326b29f7ef 100644
--- a/polly/test/CodeGen/OpenMP/matmul-parallel.ll
+++ b/polly/test/CodeGen/OpenMP/matmul-parallel.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-opt-isl -polly-ast -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s
-; RUN: opt %loadPolly -polly-parallel -polly-opt-isl -polly-codegen -S < %s | FileCheck --check-prefix=CODEGEN %s
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,print<polly-ast>' -disable-output -debug-only=polly-ast < %s 2>&1 | FileCheck --check-prefix=AST %s
+; RUN: opt %loadNPMPolly -polly-parallel '-passes=polly-opt-isl,polly-codegen' -S < %s | FileCheck --check-prefix=CODEGEN %s
; REQUIRES: asserts
; Parallelization of detected matrix-multiplication.
diff --git a/polly/test/CodeGen/OpenMP/recomputed-srem.ll b/polly/test/CodeGen/OpenMP/recomputed-srem.ll
index cfae8e943cf1..67db35ae2ca2 100644
--- a/polly/test/CodeGen/OpenMP/recomputed-srem.ll
+++ b/polly/test/CodeGen/OpenMP/recomputed-srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-codegen -polly-parallel \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -polly-parallel \
; RUN: -polly-parallel-force -S < %s | FileCheck %s
;
; Test to verify that we pass %rem96 to the parallel subfunction.
diff --git a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
index f243c3a04949..96dc4250cd05 100644
--- a/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
+++ b/polly/test/CodeGen/OpenMP/reference-argument-from-non-affine-region.ll
@@ -1,15 +1,15 @@
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen -polly-scheduling=runtime \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen -polly-scheduling=runtime \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-IR
diff --git a/polly/test/CodeGen/OpenMP/reference-other-bb.ll b/polly/test/CodeGen/OpenMP/reference-other-bb.ll
index b7abdc23d258..dbfbd9a90508 100644
--- a/polly/test/CodeGen/OpenMP/reference-other-bb.ll
+++ b/polly/test/CodeGen/OpenMP/reference-other-bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; IR: @foo_polly_subfn
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
index b88589f39a6f..ee43b8aa34a4 100644
--- a/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
+++ b/polly/test/CodeGen/OpenMP/reference-preceeding-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; - Test the case where scalar evolution references a loop that is outside
diff --git a/polly/test/CodeGen/OpenMP/reference_latest.ll b/polly/test/CodeGen/OpenMP/reference_latest.ll
index 54875c2630f0..7a8cd77bb157 100644
--- a/polly/test/CodeGen/OpenMP/reference_latest.ll
+++ b/polly/test/CodeGen/OpenMP/reference_latest.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-parallel -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-codegen' -polly-parallel -S < %s | FileCheck %s
;
; Test that parallel codegen handles scalars mapped to other arrays.
; After mapping "store double %add10" references the array "MemRef2".
diff --git a/polly/test/CodeGen/OpenMP/scev-rewriting.ll b/polly/test/CodeGen/OpenMP/scev-rewriting.ll
index 1b229fc19d25..9b79f2909448 100644
--- a/polly/test/CodeGen/OpenMP/scev-rewriting.ll
+++ b/polly/test/CodeGen/OpenMP/scev-rewriting.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly < %s -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable -polly-codegen -S | FileCheck %s
+; RUN: opt %loadNPMPolly < %s -polly-vectorizer=stripmine -polly-parallel -polly-parallel-force -polly-process-unprofitable -passes=polly-codegen -S | FileCheck %s
; CHECK: define internal void @DoStringSort_polly_subfn
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnueabi"
diff --git a/polly/test/CodeGen/OpenMP/single_loop.ll b/polly/test/CodeGen/OpenMP/single_loop.ll
index f79653a08d21..e5aee840ade7 100644
--- a/polly/test/CodeGen/OpenMP/single_loop.ll
+++ b/polly/test/CodeGen/OpenMP/single_loop.ll
@@ -1,14 +1,14 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-codegen -S < %s | FileCheck %s -check-prefix=IR-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -S < %s | FileCheck %s -check-prefix=IR-STRIDE4
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-import-jscop -polly-codegen -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -polly-scheduling-chunksize=43 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC-CHUNKED
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=static -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-STATIC
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM -polly-scheduling=dynamic -polly-scheduling-chunksize=4 -S -verify-dom-info < %s | FileCheck %s -check-prefix=LIBOMP-IR-DYNAMIC-FOUR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=polly-import-jscop,polly-codegen' -polly-omp-backend=LLVM -S < %s | FileCheck %s -check-prefix=LIBOMP-IR-STRIDE4
; This extensive test case tests the creation of the full set of OpenMP calls
; as well as the subfunction creation using a trivial loop as example.
diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll
index 50da5dd2b7c0..c519bfdee7a5 100644
--- a/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll
+++ b/polly/test/CodeGen/OpenMP/single_loop_with_loop_invariant_baseptr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -polly-parallel -polly-parallel-force -polly-parallel-force -polly-invariant-load-hoisting=true -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; #define N 1024
; float A[N];
diff --git a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll
index d01b7a2fdcad..f6dfd62d6bcc 100644
--- a/polly/test/CodeGen/OpenMP/single_loop_with_param.ll
+++ b/polly/test/CodeGen/OpenMP/single_loop_with_param.ll
@@ -1,15 +1,15 @@
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=IR
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-IR
-; RUN: opt %loadPolly -polly-parallel \
-; RUN: -polly-parallel-force -polly-codegen -polly-omp-backend=LLVM \
+; RUN: opt %loadNPMPolly -polly-parallel \
+; RUN: -polly-parallel-force -passes=polly-codegen -polly-omp-backend=LLVM \
; RUN: -polly-scheduling=static \
; RUN: -S -verify-dom-info < %s \
; RUN: | FileCheck %s -check-prefix=LIBOMP-STATIC-IR
diff --git a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll
index 05c6ed177e9c..934e04461f13 100644
--- a/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll
+++ b/polly/test/CodeGen/OpenMP/two-parallel-loops-reference-outer-indvar.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=AST
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s -check-prefix=IR
; This test case verifies that we create correct code even if two OpenMP loops
; share common outer variables.
diff --git a/polly/test/CodeGen/PHIInExit.ll b/polly/test/CodeGen/PHIInExit.ll
index eadd6054386b..3e0c9d67d5ca 100644
--- a/polly/test/CodeGen/PHIInExit.ll
+++ b/polly/test/CodeGen/PHIInExit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
%struct..0__pthread_mutex_s = type { i32, i32, i32, i32, i32, i32, %struct.__pthread_list_t }
diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll
index 84827dd26049..76b2fa9a35b2 100644
--- a/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll
+++ b/polly/test/CodeGen/RuntimeDebugBuilder/combine_different_values.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-codegen-add-debug-printing \
; RUN: -polly-ignore-aliasing < %s | FileCheck %s
diff --git a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll
index 822eccc306ef..4ffb7fd6e462 100644
--- a/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll
+++ b/polly/test/CodeGen/RuntimeDebugBuilder/stmt_tracing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-codegen-trace-stmts -polly-codegen-trace-scalars -passes=polly-codegen -S < %s | FileCheck %s
;
define void @func(i32 %n, ptr %A) {
diff --git a/polly/test/CodeGen/alias-check-multi-dim.ll b/polly/test/CodeGen/alias-check-multi-dim.ll
index d923a4cc14fd..0440bda74b39 100644
--- a/polly/test/CodeGen/alias-check-multi-dim.ll
+++ b/polly/test/CodeGen/alias-check-multi-dim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll
index 7c5ca012a378..68c17a807e8e 100644
--- a/polly/test/CodeGen/alias_metadata_too_many_arrays.ll
+++ b/polly/test/CodeGen/alias_metadata_too_many_arrays.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-ignore-aliasing -S < %s \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -S < %s \
; RUN: | FileCheck %s
;
; void manyarrays(float A1[], float A2[], float A3[], float A4[], float A5[],
diff --git a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll
index a087414b8403..8e1fc3b32835 100644
--- a/polly/test/CodeGen/aliasing_different_base_and_access_type.ll
+++ b/polly/test/CodeGen/aliasing_different_base_and_access_type.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; We have to cast %B to "short *" before we create RTCs.
;
diff --git a/polly/test/CodeGen/aliasing_different_pointer_types.ll b/polly/test/CodeGen/aliasing_different_pointer_types.ll
index 91f5eab6b2a6..e601c22b978d 100644
--- a/polly/test/CodeGen/aliasing_different_pointer_types.ll
+++ b/polly/test/CodeGen/aliasing_different_pointer_types.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Check that we cast the different pointer types correctly before we compare
; them in the RTC's. We use i8* as max pointer type.
diff --git a/polly/test/CodeGen/aliasing_multidimensional_access.ll b/polly/test/CodeGen/aliasing_multidimensional_access.ll
index 48768399e850..e1dae03280a0 100644
--- a/polly/test/CodeGen/aliasing_multidimensional_access.ll
+++ b/polly/test/CodeGen/aliasing_multidimensional_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; Check that we calculate the maximal access into array A correctly and track the overflow state.
;
diff --git a/polly/test/CodeGen/aliasing_parametric_simple_1.ll b/polly/test/CodeGen/aliasing_parametric_simple_1.ll
index 5422da4426e9..a79ba2532535 100644
--- a/polly/test/CodeGen/aliasing_parametric_simple_1.ll
+++ b/polly/test/CodeGen/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; void jd(int *A, int *B, int c) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/aliasing_parametric_simple_2.ll b/polly/test/CodeGen/aliasing_parametric_simple_2.ll
index de945d403f92..efe4af1c9e7c 100644
--- a/polly/test/CodeGen/aliasing_parametric_simple_2.ll
+++ b/polly/test/CodeGen/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; void jd(int *A, int *B, int c) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/CodeGen/aliasing_struct_element.ll b/polly/test/CodeGen/aliasing_struct_element.ll
index 2219ca9d28bb..3079e58d7dab 100644
--- a/polly/test/CodeGen/aliasing_struct_element.ll
+++ b/polly/test/CodeGen/aliasing_struct_element.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; We should only access (or compute the address of) "the first element" of %S
; as it is a single struct not a struct array. The maximal access to S, thus
diff --git a/polly/test/CodeGen/alignment.ll b/polly/test/CodeGen/alignment.ll
index a94b1f7e2883..e0f6a959476f 100644
--- a/polly/test/CodeGen/alignment.ll
+++ b/polly/test/CodeGen/alignment.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Check that the special alignment information is kept
;
diff --git a/polly/test/CodeGen/annotated_alias_scopes.ll b/polly/test/CodeGen/annotated_alias_scopes.ll
index f8d14cd34b62..b1777a1b5f5d 100644
--- a/polly/test/CodeGen/annotated_alias_scopes.ll
+++ b/polly/test/CodeGen/annotated_alias_scopes.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=SCOPES
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=SCOPES
;
; Check that we create alias scopes that indicate the accesses to A, B and C cannot alias in any way.
;
diff --git a/polly/test/CodeGen/blas_sscal_simplified.ll b/polly/test/CodeGen/blas_sscal_simplified.ll
index a370fcff46f8..99f2eae9dd8e 100644
--- a/polly/test/CodeGen/blas_sscal_simplified.ll
+++ b/polly/test/CodeGen/blas_sscal_simplified.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;
; Regression test for a bug in the runtime check generation.
diff --git a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
index e0f8c435879a..5dba93373b70 100644
--- a/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
+++ b/polly/test/CodeGen/conflict-between-loop-invariant-code-hosting-and-escape-map-computation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -disable-output < %s
;
; CHECK: store i32 %tmp14_p_scalar_, ptr %tmp14.s2a
; CHECK: %tmp14.final_reload = load i32, ptr %tmp14.s2a
diff --git a/polly/test/CodeGen/constant_condition.ll b/polly/test/CodeGen/constant_condition.ll
index dad1f6cffd17..905aa52df508 100644
--- a/polly/test/CodeGen/constant_condition.ll
+++ b/polly/test/CodeGen/constant_condition.ll
@@ -1,4 +1,4 @@
-;RUN: opt %loadPolly -polly-prepare -polly-print-ast -disable-output < %s | FileCheck %s
+;RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -disable-output < %s 2>&1 | FileCheck %s
;#include <string.h>
;int A[1];
diff --git a/polly/test/CodeGen/create-conditional-scop.ll b/polly/test/CodeGen/create-conditional-scop.ll
index f51a2dcc9b3c..b8c9a81b71a9 100644
--- a/polly/test/CodeGen/create-conditional-scop.ll
+++ b/polly/test/CodeGen/create-conditional-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-codegen -verify-loop-info < %s -S | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -verify-loop-info < %s -S | FileCheck %s
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
index 991e3c83eef1..6ffe6bf67d54 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;
; Check we do not crash even though the dead %tmp8 is referenced by a parameter
; and we do not pre-load it (as it is dead).
diff --git a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
index 153f6912cea5..68c247a60831 100644
--- a/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
+++ b/polly/test/CodeGen/dead_invariant_load_instruction_referenced_by_parameter_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;
; Check we do not crash even though there is a dead load that is referenced by
; a parameter and we do not pre-load it (as it is dead).
diff --git a/polly/test/CodeGen/debug-intrinsics.ll b/polly/test/CodeGen/debug-intrinsics.ll
index 2feeb7c838b0..25c63da4891c 100644
--- a/polly/test/CodeGen/debug-intrinsics.ll
+++ b/polly/test/CodeGen/debug-intrinsics.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly \
-; RUN: -polly-analyze-read-only-scalars=false -polly-codegen -S < %s | \
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen -S < %s | \
; RUN: FileCheck %s
-; RUN: opt %loadPolly \
-; RUN: -polly-analyze-read-only-scalars=true -polly-codegen -S < %s | \
+; RUN: opt %loadNPMPolly \
+; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen -S < %s | \
; RUN: FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
index c9e006a01204..edc03333a358 100644
--- a/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
+++ b/polly/test/CodeGen/dominance_problem_after_early_codegen_bailout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
;
; This caused dominance problems at some point as we do bail out during
; code generation. Just verify it runs through.
diff --git a/polly/test/CodeGen/empty_domain_in_context.ll b/polly/test/CodeGen/empty_domain_in_context.ll
index c67ace9502e1..a2fe805f402e 100644
--- a/polly/test/CodeGen/empty_domain_in_context.ll
+++ b/polly/test/CodeGen/empty_domain_in_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-optree -polly-opt-isl -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-optree,polly-opt-isl,polly-codegen' -S < %s | FileCheck %s
;
; llvm.org/PR35362
; isl codegen does not allow to generate isl_ast_expr from pw_aff which have an
diff --git a/polly/test/CodeGen/entry_with_trivial_phi.ll b/polly/test/CodeGen/entry_with_trivial_phi.ll
index b057690ab29b..f2c9da04d649 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s
;
; The entry of this scop's simple region (entry.split => for.end) has a trivial
; PHI node. LCSSA may create such PHI nodes. This is a breakdown of this case in
diff --git a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
index 5673cc746b5f..2f1ec1a7872a 100644
--- a/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
+++ b/polly/test/CodeGen/entry_with_trivial_phi_other_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; The entry of this scop's simple region (entry.split => for.end) has a trivial
; PHI node that is used in a different block of the scop region. LCSSA may create such
diff --git a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
index 9832afe7a5fd..63b6becd1957 100644
--- a/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
+++ b/polly/test/CodeGen/error-stmt-in-non-affine-region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
; XFAIL: *
;
; CHECK-LABEL: polly.stmt.if.then:
diff --git a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
index 048847f3e322..008e16caf9c2 100644
--- a/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
+++ b/polly/test/CodeGen/error_block_contains_invalid_memory_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/exprModDiv.ll b/polly/test/CodeGen/exprModDiv.ll
index 936b018bc1ad..c9b419abe324 100644
--- a/polly/test/CodeGen/exprModDiv.ll
+++ b/polly/test/CodeGen/exprModDiv.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-codegen -S < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-codegen -polly-import-jscop-postfix=pow2 \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=pow2 \
; RUN: -S < %s | FileCheck %s -check-prefix=POW2
;
; void exprModDiv(float *A, float *B, float *C, long N, long p) {
diff --git a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
index d7588b3b8e00..1ca2413fd5e1 100644
--- a/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
+++ b/polly/test/CodeGen/hoisted_load_escapes_through_phi.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -S -polly-codegen \
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
; RUN: -polly-invariant-load-hoisting=false < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-codegen \
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; Check that we generate valid code even if the load of cont_STACKPOINTER is
diff --git a/polly/test/CodeGen/hoisting_1.ll b/polly/test/CodeGen/hoisting_1.ll
index 86b56637bc2c..1f065bec8032 100644
--- a/polly/test/CodeGen/hoisting_1.ll
+++ b/polly/test/CodeGen/hoisting_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -tbaa -polly-codegen -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/hoisting_2.ll b/polly/test/CodeGen/hoisting_2.ll
index 1f1be11c2d98..e76ee066af08 100644
--- a/polly/test/CodeGen/hoisting_2.ll
+++ b/polly/test/CodeGen/hoisting_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -tbaa -polly-codegen -polly-allow-differing-element-types -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -polly-allow-differing-element-types -disable-output %s
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/inner_scev_sdiv_1.ll b/polly/test/CodeGen/inner_scev_sdiv_1.ll
index 1a463fc178d1..d210105c46ba 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_1.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s
;
; Excerpt from the test-suite's oggenc reduced using bugpoint.
;
diff --git a/polly/test/CodeGen/inner_scev_sdiv_2.ll b/polly/test/CodeGen/inner_scev_sdiv_2.ll
index 76138034603e..74b914d1d87a 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_2.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; The SCEV expression in this test case refers to a sequence of sdiv
; instructions, which are part of different bbs in the SCoP. When code
diff --git a/polly/test/CodeGen/inner_scev_sdiv_3.ll b/polly/test/CodeGen/inner_scev_sdiv_3.ll
index 874ead14ded2..33440457bd46 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_3.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; This test case has an inner SCEV sdiv that will escape the SCoP. Just check we
; do not crash and generate valid code.
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
index 6514e18687e4..31c14e85f253 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
;
; CHECK: [N] -> { Stmt_bb11[i0, i1] : i0 < N and i1 >= 0 and 3i1 <= -3 + i0 };
; CODEGEN: polly
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
index 032942923379..b42371b0891e 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_lb_invariant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen \
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
; RUN: < %s | FileCheck %s
;
; Check that this will not crash our code generation.
diff --git a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
index f7292ca3073a..45af63402c98 100644
--- a/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
+++ b/polly/test/CodeGen/inner_scev_sdiv_in_rtc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
;
; This will just check that we generate valid code here.
diff --git a/polly/test/CodeGen/intrinsics_lifetime.ll b/polly/test/CodeGen/intrinsics_lifetime.ll
index 6141b3abdd8a..6dca218b6386 100644
--- a/polly/test/CodeGen/intrinsics_lifetime.ll
+++ b/polly/test/CodeGen/intrinsics_lifetime.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
;
; Verify that we remove the lifetime markers from everywhere.
;
diff --git a/polly/test/CodeGen/intrinsics_misc.ll b/polly/test/CodeGen/intrinsics_misc.ll
index c0a52fe97329..84164893ebf7 100644
--- a/polly/test/CodeGen/intrinsics_misc.ll
+++ b/polly/test/CodeGen/intrinsics_misc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
;
; Verify that we remove the misc intrinsics from the optimized SCoP.
;
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
index 6727247a7f04..e7cbf748bea7 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; This crashed our codegen at some point, verify it runs through
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
index a573049c8f67..24e9240c234d 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; This crashed our codegen at some point, verify it runs through
diff --git a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
index e05ca9951434..d1d861e316ee 100644
--- a/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
+++ b/polly/test/CodeGen/inv-load-lnt-crash-wrong-order.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; This crashed our codegen at some point, verify it runs through
diff --git a/polly/test/CodeGen/invariant-load-dimension.ll b/polly/test/CodeGen/invariant-load-dimension.ll
index 7793c3b3bee3..21e53055c56b 100644
--- a/polly/test/CodeGen/invariant-load-dimension.ll
+++ b/polly/test/CodeGen/invariant-load-dimension.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadPolly -S < %s -polly-codegen -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S < %s -passes=polly-codegen -polly-process-unprofitable -polly-invariant-load-hoisting | FileCheck %s -check-prefix=CODEGEN
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
diff --git a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
index 474100995fd8..1fd9cb81771c 100644
--- a/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
+++ b/polly/test/CodeGen/invariant-load-preload-base-pointer-origin-first.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
;
; Check that we generate valid code as we did not preload the base pointer
; origin of %tmp4 at some point.
diff --git a/polly/test/CodeGen/invariant_cannot_handle_void.ll b/polly/test/CodeGen/invariant_cannot_handle_void.ll
index de5d13d6a69a..0859a4e4997e 100644
--- a/polly/test/CodeGen/invariant_cannot_handle_void.ll
+++ b/polly/test/CodeGen/invariant_cannot_handle_void.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
;
; The offset of the %tmp1 load wrt. %buff (62 bytes) is not divisible
; by the type size (i32 = 4 bytes), thus we will have to represent %buff
diff --git a/polly/test/CodeGen/invariant_load.ll b/polly/test/CodeGen/invariant_load.ll
index be3f7a32f35b..2d5e6042ea6a 100644
--- a/polly/test/CodeGen/invariant_load.ll
+++ b/polly/test/CodeGen/invariant_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_address_space.ll b/polly/test/CodeGen/invariant_load_address_space.ll
index 7c611ad3dd87..3d1958e5b8a4 100644
--- a/polly/test/CodeGen/invariant_load_address_space.ll
+++ b/polly/test/CodeGen/invariant_load_address_space.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %polly.access.B = getelementptr i32, ptr addrspace(1) %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_alias_metadata.ll b/polly/test/CodeGen/invariant_load_alias_metadata.ll
index 5a82d82d43f8..252463384a5c 100644
--- a/polly/test/CodeGen/invariant_load_alias_metadata.ll
+++ b/polly/test/CodeGen/invariant_load_alias_metadata.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true \
; RUN: -S < %s | FileCheck %s
;
; This test case checks whether Polly generates alias metadata in case of
diff --git a/polly/test/CodeGen/invariant_load_base_pointer.ll b/polly/test/CodeGen/invariant_load_base_pointer.ll
index eb07f8317b79..d4ac433475f0 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %polly.access.BPLoc = getelementptr ptr, ptr %BPLoc, i64 0
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
index 538077bb09e8..06a9a93363ed 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %0 = sext i32 %N to i64
diff --git a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
index 7c2fb3ef97ed..66ab9a31b103 100644
--- a/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
+++ b/polly/test/CodeGen/invariant_load_base_pointer_conditional_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true --polly-overflow-tracking=always < %s | FileCheck %s --check-prefix=IRA
;
; As (p + q) can overflow we have to check that we load from
; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
index dc5a4c890381..fa904e9b96d3 100644
--- a/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
+++ b/polly/test/CodeGen/invariant_load_canonicalize_array_baseptrs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/CodeGen/invariant_load_condition.ll b/polly/test/CodeGen/invariant_load_condition.ll
index edf0814d8983..36e588329d66 100644
--- a/polly/test/CodeGen/invariant_load_condition.ll
+++ b/polly/test/CodeGen/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK-NEXT: %polly.access.C = getelementptr i32, ptr %C, i64 0
diff --git a/polly/test/CodeGen/invariant_load_different_sized_types.ll b/polly/test/CodeGen/invariant_load_different_sized_types.ll
index 5b91a1901061..2995bce4c660 100644
--- a/polly/test/CodeGen/invariant_load_different_sized_types.ll
+++ b/polly/test/CodeGen/invariant_load_different_sized_types.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S \
; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/invariant_load_escaping.ll b/polly/test/CodeGen/invariant_load_escaping.ll
index efccdf468a18..416148b72303 100644
--- a/polly/test/CodeGen/invariant_load_escaping.ll
+++ b/polly/test/CodeGen/invariant_load_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; int f(int *A, int *B) {
; // Possible aliasing between A and B but if not then *B would be
diff --git a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
index c0ea888acdde..906bfc1805d3 100644
--- a/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
+++ b/polly/test/CodeGen/invariant_load_escaping_second_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
;
; void fence(void);
;
diff --git a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
index 241252b5d549..472c6c67a45e 100644
--- a/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
+++ b/polly/test/CodeGen/invariant_load_in_non_affine_subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; This crashed at some point as the invariant load is in a non-affine
; subregion. Just check it does not crash anymore.
diff --git a/polly/test/CodeGen/invariant_load_loop_ub.ll b/polly/test/CodeGen/invariant_load_loop_ub.ll
index ab9aa0dc69a7..1db27ad8e58b 100644
--- a/polly/test/CodeGen/invariant_load_loop_ub.ll
+++ b/polly/test/CodeGen/invariant_load_loop_ub.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-process-unprofitable -S < %s | FileCheck %s
;
; CHECK: polly.start
;
diff --git a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
index 08ff0871b610..01b01761d908 100644
--- a/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
+++ b/polly/test/CodeGen/invariant_load_not_executed_but_in_parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
;
; Check that this does not crash as the invariant load is not executed (thus
; not preloaded) but still referenced by one of the parameters.
diff --git a/polly/test/CodeGen/invariant_load_outermost.ll b/polly/test/CodeGen/invariant_load_outermost.ll
index f42135c09014..7e0550fb3be9 100644
--- a/polly/test/CodeGen/invariant_load_outermost.ll
+++ b/polly/test/CodeGen/invariant_load_outermost.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
; CHECK: polly.start
diff --git a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
index d365c99eff66..abf957b556da 100644
--- a/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
+++ b/polly/test/CodeGen/invariant_load_parameters_cyclic_dependence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; SCOP: Assumed Context:
; SCOP-NEXT: [p_0, tmp4] -> { : }
diff --git a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
index b4d4c55f0d9b..b565f1bd5096 100644
--- a/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/CodeGen/invariant_load_ptr_ptr_noalias.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK: %polly.access.A = getelementptr ptr, ptr %A, i64 42
diff --git a/polly/test/CodeGen/invariant_load_scalar_dep.ll b/polly/test/CodeGen/invariant_load_scalar_dep.ll
index 05a40a4c47cc..ba2999e27984 100644
--- a/polly/test/CodeGen/invariant_load_scalar_dep.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -S < %s | FileCheck %s
;
; CHECK-LABEL: polly.preload.begin:
; CHECK: %polly.access.B = getelementptr i32, ptr %B, i64 0
diff --git a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
index 44c035855b76..26c964c9c6a7 100644
--- a/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
+++ b/polly/test/CodeGen/invariant_load_scalar_escape_alloca_sharing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s
;
; Verify the preloaded %tmp0 is stored and communicated in the same alloca.
; In this case, we do not reload %ncol.load from the scalar stack slot, but
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
index 0b6929a5fd3f..6bf11d5697bd 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
;
; Check we do not crash even though we pre-load values with different types
; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
index 2eb913fed447..07ce94152245 100644
--- a/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
+++ b/polly/test/CodeGen/invariant_loads_from_struct_with_different_types_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true < %s
;
; Check we do not crash even though we pre-load values with different types
; from the same base pointer.
diff --git a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
index a0c1f891bdf6..19b30afd33ba 100644
--- a/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
+++ b/polly/test/CodeGen/invariant_loads_ignore_parameter_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting \
; RUN: -polly-ignore-parameter-bounds -S < %s | FileCheck %s
; CHECK: polly.preload.begin:
diff --git a/polly/test/CodeGen/invariant_verify_function_failed.ll b/polly/test/CodeGen/invariant_verify_function_failed.ll
index 6020caeee85d..c9affac076e9 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; This crashed at some point as the pointer returned by the call
; to @__errno_location is invariant and defined in the SCoP but not
diff --git a/polly/test/CodeGen/invariant_verify_function_failed_2.ll b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
index 81a4bd1dc153..7ef5608d7d19 100644
--- a/polly/test/CodeGen/invariant_verify_function_failed_2.ll
+++ b/polly/test/CodeGen/invariant_verify_function_failed_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -S -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s -check-prefix=SCOPS
-; RUN: opt %loadPolly -S -polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCOPS
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -polly-invariant-load-hoisting=true %s | FileCheck %s
;
; Check we generate valid code.
diff --git a/polly/test/CodeGen/issue56692.ll b/polly/test/CodeGen/issue56692.ll
index e935e43bfa44..34c4e398e2ac 100644
--- a/polly/test/CodeGen/issue56692.ll
+++ b/polly/test/CodeGen/issue56692.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-parallel-force -polly-omp-backend=LLVM -polly-codegen-verify -passes=polly-codegen -S < %s | FileCheck %s
; https://github.com/llvm/llvm-project/issues/56692
;
; CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}), !dbg ![[OPTLOC:[0-9]+]]
diff --git a/polly/test/CodeGen/large-numbers-in-boundary-context.ll b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
index a0328dfec651..b228baf9bdf2 100644
--- a/polly/test/CodeGen/large-numbers-in-boundary-context.ll
+++ b/polly/test/CodeGen/large-numbers-in-boundary-context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
; XFAIL: *
;
; The boundary context contains a constant that does not fit in 64 bits. Hence,
diff --git a/polly/test/CodeGen/load_subset_with_context.ll b/polly/test/CodeGen/load_subset_with_context.ll
index ef0e051d5635..ccd4198b9fe8 100644
--- a/polly/test/CodeGen/load_subset_with_context.ll
+++ b/polly/test/CodeGen/load_subset_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; A load must provide a value for every statement instance.
; Statement instances not in the SCoP's context are irrelevant.
diff --git a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
index 90c61c591623..d9065858ff25 100644
--- a/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
+++ b/polly/test/CodeGen/loop-invariant-load-type-mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/loop_with_condition.ll b/polly/test/CodeGen/loop_with_condition.ll
index 618a542c179a..49e312404cca 100644
--- a/polly/test/CodeGen/loop_with_condition.ll
+++ b/polly/test/CodeGen/loop_with_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include <string.h>
;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_2.ll b/polly/test/CodeGen/loop_with_condition_2.ll
index b1a116785069..8ae38eeeb498 100644
--- a/polly/test/CodeGen/loop_with_condition_2.ll
+++ b/polly/test/CodeGen/loop_with_condition_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
; Verify that we actually detect this loop as the innermost loop even though
; there is a conditional inside.
diff --git a/polly/test/CodeGen/loop_with_condition_ineq.ll b/polly/test/CodeGen/loop_with_condition_ineq.ll
index c35208c72dfe..64019a609021 100644
--- a/polly/test/CodeGen/loop_with_condition_ineq.ll
+++ b/polly/test/CodeGen/loop_with_condition_ineq.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include <string.h>
;#define N 1024
diff --git a/polly/test/CodeGen/loop_with_condition_nested.ll b/polly/test/CodeGen/loop_with_condition_nested.ll
index 24a49b47d9e6..5dcb51dcb91c 100644
--- a/polly/test/CodeGen/loop_with_condition_nested.ll
+++ b/polly/test/CodeGen/loop_with_condition_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
;#include <string.h>
diff --git a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
index 4444cf1dc4dd..26fe4eb82ae4 100644
--- a/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
+++ b/polly/test/CodeGen/loop_with_conditional_entry_edge_split_hard_case.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Test case to trigger the hard way of creating a unique entering
; edge for the SCoP. It is triggered because the entering edge
diff --git a/polly/test/CodeGen/memcpy_annotations.ll b/polly/test/CodeGen/memcpy_annotations.ll
index a0a09b75c82e..501aa8fbea4d 100644
--- a/polly/test/CodeGen/memcpy_annotations.ll
+++ b/polly/test/CodeGen/memcpy_annotations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Verify that @llvm.memcpy does not get a !alias.scope annotation.
; @llvm.memcpy takes two pointers, it is ambiguous to which the
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
index 63afad6e2f41..f63eb18118e7 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-codegen \
+; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
; CHECK: polly
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim-non-matching-typesize.ll b/polly/test/CodeGen/multidim-non-matching-typesize.ll
index d117cefe3376..63e43c83ada5 100644
--- a/polly/test/CodeGen/multidim-non-matching-typesize.ll
+++ b/polly/test/CodeGen/multidim-non-matching-typesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-codegen \
+; RUN: opt %loadNPMPolly -disable-basic-aa -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
diff --git a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
index 464ddb3740f7..86b17573caad 100644
--- a/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/CodeGen/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/CodeGen/multidim_alias_check.ll b/polly/test/CodeGen/multidim_alias_check.ll
index 585577da0e6d..93e34e2fd0fc 100644
--- a/polly/test/CodeGen/multidim_alias_check.ll
+++ b/polly/test/CodeGen/multidim_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: %polly.access.sext.A = sext i32 %n to i64
diff --git a/polly/test/CodeGen/multiple-codegens.ll b/polly/test/CodeGen/multiple-codegens.ll
index f950fa4a3e1d..2fa974e66df5 100644
--- a/polly/test/CodeGen/multiple-codegens.ll
+++ b/polly/test/CodeGen/multiple-codegens.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-scops -polly-opt-isl -polly-codegen -polly-scops -polly-codegen -S < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen,polly-codegen)" -S < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(polly-opt-isl,polly-codegen),scop(polly-codegen)" -S < %s | FileCheck %s
;
diff --git a/polly/test/CodeGen/multiple-scops-in-a-row.ll b/polly/test/CodeGen/multiple-scops-in-a-row.ll
index a24a2e71ad4e..b81ba04e3646 100644
--- a/polly/test/CodeGen/multiple-scops-in-a-row.ll
+++ b/polly/test/CodeGen/multiple-scops-in-a-row.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
; This test case has two scops in a row. When code generating the first scop,
; the second scop is invalidated. This test case verifies that we do not crash
diff --git a/polly/test/CodeGen/multiple-types-invariant-load-2.ll b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
index 0fd1df75e2ec..f6aca37c932b 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load-2.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-allow-differing-element-types < %s | FileCheck %s
; CHECK: polly
diff --git a/polly/test/CodeGen/multiple-types-invariant-load.ll b/polly/test/CodeGen/multiple-types-invariant-load.ll
index b1434679e3d1..930041eaddaa 100644
--- a/polly/test/CodeGen/multiple-types-invariant-load.ll
+++ b/polly/test/CodeGen/multiple-types-invariant-load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-differing-element-types -polly-codegen -S \
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
; CHECK: %polly.access.global.load = getelementptr i32, ptr %global.load, i64 0
diff --git a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
index 0163f248229e..1e06a7e186bb 100644
--- a/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
+++ b/polly/test/CodeGen/multiple_sai_fro_same_base_address.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-position=before-vectorizer -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -polly-position=before-vectorizer -polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -polly-position=before-vectorizer -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
; The IR has two ScopArrayInfo for the value %next.0. This used to produce two
; phi nodes in polly.merge_new_and_old, one illegally using the result of the
diff --git a/polly/test/CodeGen/no-overflow-tracking.ll b/polly/test/CodeGen/no-overflow-tracking.ll
index f11e8927ddee..d5ad9a7aef23 100644
--- a/polly/test/CodeGen/no-overflow-tracking.ll
+++ b/polly/test/CodeGen/no-overflow-tracking.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never -polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true -polly-overflow-tracking=never -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
;
; As (p + q) can overflow we have to check that we load from
; I[p + q] only if it does not.
diff --git a/polly/test/CodeGen/no_guard_bb.ll b/polly/test/CodeGen/no_guard_bb.ll
index 47c87ff7c868..a022083f43a9 100644
--- a/polly/test/CodeGen/no_guard_bb.ll
+++ b/polly/test/CodeGen/no_guard_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S -verify-dom-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info < %s | FileCheck %s
;
; CHECK-NOT: br i1 true, label %polly.{{.*}}, label %polly.{{.*}}
;
diff --git a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
index ebf36acc8d96..6015516a3bc4 100644
--- a/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
+++ b/polly/test/CodeGen/non-affine-dominance-generated-entering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR25439
; Scalar reloads in the generated entering block were not recognized as
diff --git a/polly/test/CodeGen/non-affine-exit-node-dominance.ll b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
index af19d2420e3e..0d0f634ed7c1 100644
--- a/polly/test/CodeGen/non-affine-exit-node-dominance.ll
+++ b/polly/test/CodeGen/non-affine-exit-node-dominance.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR25439
; The dominance of the generated non-affine subregion block was based on the
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
index 2aca316d4c88..b7394b248404 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
index 18a4b6e4ed4a..b9386333a79b 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
index 8a07ee7c7424..6460c427270f 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
define void @foo(ptr %A, i1 %cond0, i1 %cond1) {
diff --git a/polly/test/CodeGen/non-affine-phi-node-expansion.ll b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
index 091fc3e323dc..8fd8cc14124b 100644
--- a/polly/test/CodeGen/non-affine-phi-node-expansion.ll
+++ b/polly/test/CodeGen/non-affine-phi-node-expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
index 6a1d1f12ba9c..007a4c586aa3 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused the code generation to generate invalid code as the same operand
; of the PHI node in the non-affine region was synthesized at the wrong place.
diff --git a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
index 036bf34cb7f7..20edbf2bd6c0 100644
--- a/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
+++ b/polly/test/CodeGen/non-affine-region-exit-phi-incoming-synthesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused the code generation to generate invalid code as the same BBMap was
; used for the whole non-affine region. When %add is synthesized for the
diff --git a/polly/test/CodeGen/non-affine-region-implicit-store.ll b/polly/test/CodeGen/non-affine-region-implicit-store.ll
index e89197e24852..0ff39d3fe882 100644
--- a/polly/test/CodeGen/non-affine-region-implicit-store.ll
+++ b/polly/test/CodeGen/non-affine-region-implicit-store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR25438
; After loop versioning, a dominance check of a non-affine subregion's exit node
diff --git a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
index f6e4eb57319d..7df3d8976ea8 100644
--- a/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
+++ b/polly/test/CodeGen/non-affine-region-phi-references-in-scop-value.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-allow-nonaffine-loops \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops \
; RUN: -S < %s | FileCheck %s
; This test verifies that values defined in another scop statement and used by
diff --git a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
index 6c749a404336..179062dd62d0 100644
--- a/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
+++ b/polly/test/CodeGen/non-affine-subregion-dominance-reuse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S -verify-dom-info \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S -verify-dom-info \
; RUN: < %s | FileCheck %s
;
; Check that we do not reuse the B[i-1] GEP created in block S again in
diff --git a/polly/test/CodeGen/non-affine-switch.ll b/polly/test/CodeGen/non-affine-switch.ll
index 9c08b98700ae..427e7e2461f1 100644
--- a/polly/test/CodeGen/non-affine-switch.ll
+++ b/polly/test/CodeGen/non-affine-switch.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -S -passes=polly-codegen < %s | FileCheck %s
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
index cc0e60abcd09..292c0f2b5394 100644
--- a/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
+++ b/polly/test/CodeGen/non-affine-synthesized-in-branch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR25412
; %synthgep caused %gep to be synthesized in subregion_if which was reused for
diff --git a/polly/test/CodeGen/non-affine-update.ll b/polly/test/CodeGen/non-affine-update.ll
index d2b7fae75b23..03f091a40501 100644
--- a/polly/test/CodeGen/non-affine-update.ll
+++ b/polly/test/CodeGen/non-affine-update.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -S < %s | FileCheck %s
;
; void non-affine-update(double A[], double C[], double B[]) {
; for (int i = 0; i < 10; i++) {
diff --git a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
index 5f6642b0630d..153cdb7ed9f6 100644
--- a/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
+++ b/polly/test/CodeGen/non-hoisted-load-needed-as-base-ptr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -tbaa -polly-codegen -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/non_affine_float_compare.ll b/polly/test/CodeGen/non_affine_float_compare.ll
index be310b5bf5ca..304a9016665c 100644
--- a/polly/test/CodeGen/non_affine_float_compare.ll
+++ b/polly/test/CodeGen/non_affine_float_compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen \
+; RUN: opt %loadNPMPolly -passes=polly-codegen \
; RUN: -polly-allow-nonaffine-branches -S -verify-dom-info \
; RUN: < %s | FileCheck %s
;
diff --git a/polly/test/CodeGen/only_non_affine_error_region.ll b/polly/test/CodeGen/only_non_affine_error_region.ll
index b2ad1c1fe3fd..445cef0d6f69 100644
--- a/polly/test/CodeGen/only_non_affine_error_region.ll
+++ b/polly/test/CodeGen/only_non_affine_error_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK-NOT: polly.start
;
diff --git a/polly/test/CodeGen/openmp_limit_threads.ll b/polly/test/CodeGen/openmp_limit_threads.ll
index e8eb819f13d9..4c33be340725 100644
--- a/polly/test/CodeGen/openmp_limit_threads.ll
+++ b/polly/test/CodeGen/openmp_limit_threads.ll
@@ -1,10 +1,10 @@
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -S < %s | FileCheck %s --check-prefix=AUTO
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=ONE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=FOUR
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
-; RUN: opt %loadPolly -polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -S < %s | FileCheck %s --check-prefix=LIBOMP-AUTO
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=1 -S < %s | FileCheck %s --check-prefix=LIBOMP-ONE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-parallel -polly-omp-backend=LLVM -polly-num-threads=4 -S < %s | FileCheck %s --check-prefix=LIBOMP-FOUR
; Ensure that the provided thread numbers are forwarded to the OpenMP calls.
;
diff --git a/polly/test/CodeGen/out-of-scop-phi-node-use.ll b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
index 54e909ecf378..a4f942309ed2 100644
--- a/polly/test/CodeGen/out-of-scop-phi-node-use.ll
+++ b/polly/test/CodeGen/out-of-scop-phi-node-use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/CodeGen/param_div_div_div_2.ll b/polly/test/CodeGen/param_div_div_div_2.ll
index 764ca241f166..8eba6444abb1 100644
--- a/polly/test/CodeGen/param_div_div_div_2.ll
+++ b/polly/test/CodeGen/param_div_div_div_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s --check-prefix=IR
;
; Check that we guard the divisions because we moved them and thereby increased
; their domain.
diff --git a/polly/test/CodeGen/partial_write_array.ll b/polly/test/CodeGen/partial_write_array.ll
index 6dc5550d82af..8bb1bc2c3d8c 100644
--- a/polly/test/CodeGen/partial_write_array.ll
+++ b/polly/test/CodeGen/partial_write_array.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; Partial write of an array access.
;
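The hunks in this section all apply the same migration: the lit substitution %loadPolly is replaced by %loadNPMPolly, transform passes such as -polly-codegen move behind -passes=, legacy printer flags such as -polly-print-scops become print<polly-function-scops> pass instances whose output now reaches FileCheck through the added 2>&1, and chained flags such as -polly-import-jscop followed by -polly-codegen are folded into a single quoted -passes= pipeline. As a rough illustration only — this script, its name, and its narrow scope are assumptions and not part of this commit — the simplest of these rewrites could be automated along the following lines, while the composite pipelines and redirections seen in this diff still need per-test edits:

#!/usr/bin/env python3
# Hypothetical helper, not part of this commit: a sketch of how the
# simplest legacy-PM Polly RUN lines could be rewritten mechanically.
# Composite pipelines (jscop import + codegen chains, print passes that
# additionally need 2>&1, scop(...) wrappers) still require the manual
# edits shown in the surrounding hunks.
import re
import sys


def rewrite(text: str) -> str:
    # Switch the lit substitution to the new-pass-manager plugin loader.
    text = text.replace("%loadPolly", "%loadNPMPolly")
    # Transform passes keep their name but move behind -passes=.  The
    # lookahead leaves flags such as -polly-codegen-perf-monitoring alone.
    text = re.sub(r"-polly-codegen(?=\s)", "-passes=polly-codegen", text)
    # Legacy printer flags become instances of the print<...> passes.
    text = text.replace("-polly-print-ast", "'-passes=print<polly-ast>'")
    return text


if __name__ == "__main__":
    for path in sys.argv[1:]:
        with open(path) as handle:
            original = handle.read()
        updated = rewrite(original)
        if updated != original:
            with open(path, "w") as handle:
                handle.write(updated)

A hypothetical invocation would be: python3 migrate_polly_runlines.py polly/test/CodeGen/*.ll; anything the script does not recognize is left untouched for manual review.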
diff --git a/polly/test/CodeGen/partial_write_emptyset.ll b/polly/test/CodeGen/partial_write_emptyset.ll
index a25195f11ed7..67828808e2fa 100644
--- a/polly/test/CodeGen/partial_write_emptyset.ll
+++ b/polly/test/CodeGen/partial_write_emptyset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; Partial write, where "partial" is the empty set.
; The store is never executed in this case and we do generate it in the
diff --git a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll
index 18a809b30557..b26bd81b5663 100644
--- a/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll
+++ b/polly/test/CodeGen/partial_write_full_write_that_appears_partial.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; CHECK: polly.stmt.if.then81: ; preds = %polly.stmt.if.end75
; CHECK-NEXT: store float undef, ptr %fX64, align 4, !alias.scope !0, !noalias !3
diff --git a/polly/test/CodeGen/partial_write_impossible_restriction.ll b/polly/test/CodeGen/partial_write_impossible_restriction.ll
index 178227fef8e5..edee3b913ce7 100644
--- a/polly/test/CodeGen/partial_write_impossible_restriction.ll
+++ b/polly/test/CodeGen/partial_write_impossible_restriction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; The isl scheduler isolates %cond.false into two instances.
; A partial write access in one of the instances was never executed,
diff --git a/polly/test/CodeGen/partial_write_in_region.ll b/polly/test/CodeGen/partial_write_in_region.ll
index d8f57b35d585..7c138c82091e 100644
--- a/polly/test/CodeGen/partial_write_in_region.ll
+++ b/polly/test/CodeGen/partial_write_in_region.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-import-jscop-postfix=transformed -polly-codegen \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=transformed \
; RUN: -verify-dom-info \
; RUN: -S < %s | FileCheck %s
;
diff --git a/polly/test/CodeGen/partial_write_in_region_with_loop.ll b/polly/test/CodeGen/partial_write_in_region_with_loop.ll
index 48a9dbef21d1..ba15a7871f43 100644
--- a/polly/test/CodeGen/partial_write_in_region_with_loop.ll
+++ b/polly/test/CodeGen/partial_write_in_region_with_loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop \
-; RUN: -polly-import-jscop-postfix=transformed -polly-codegen \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' \
+; RUN: -polly-import-jscop-postfix=transformed \
; RUN: -verify-dom-info -polly-allow-nonaffine-loops \
; RUN: -S < %s | FileCheck %s
diff --git a/polly/test/CodeGen/partial_write_mapped_scalar.ll b/polly/test/CodeGen/partial_write_mapped_scalar.ll
index 9137ef2123c8..b8c413885cdb 100644
--- a/polly/test/CodeGen/partial_write_mapped_scalar.ll
+++ b/polly/test/CodeGen/partial_write_mapped_scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; Partial write of a (mapped) scalar.
;
diff --git a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll
index e054b65eadf3..8c1953a05ad3 100644
--- a/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll
+++ b/polly/test/CodeGen/partial_write_mapped_scalar_subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; Partial write of a (mapped) scalar in a non-affine subregion.
;
diff --git a/polly/test/CodeGen/perf_monitoring.ll b/polly/test/CodeGen/perf_monitoring.ll
index 2abbf24f5e78..4b91e5055c0b 100644
--- a/polly/test/CodeGen/perf_monitoring.ll
+++ b/polly/test/CodeGen/perf_monitoring.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
diff --git a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll
index 11d63fc47658..d5c33d64f341 100644
--- a/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll
+++ b/polly/test/CodeGen/perf_monitoring_cycles_per_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
diff --git a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll
index 9b7f324df8e4..ab99c4d2de06 100644
--- a/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll
+++ b/polly/test/CodeGen/perf_monitoring_trip_counts_per_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
diff --git a/polly/test/CodeGen/phi-defined-before-scop.ll b/polly/test/CodeGen/phi-defined-before-scop.ll
index a3b1ba264f04..f08322281d3c 100644
--- a/polly/test/CodeGen/phi-defined-before-scop.ll
+++ b/polly/test/CodeGen/phi-defined-before-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; CHECK-LABEL: polly.merge_new_and_old:
; CHECK-NEXT: %tmp7.ph.merge = phi ptr [ %tmp7.ph.final_reload, %polly.exiting ], [ %tmp7.ph, %bb6.region_exiting ]
diff --git a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll
index c34ebfc3ca02..e096aa2f4f8c 100644
--- a/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll
+++ b/polly/test/CodeGen/phi_after_error_block_outside_of_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; Make sure code generation does not break in case an 'error block' is detected
; outside of the scope. In this situation, we should not affect code generation.
diff --git a/polly/test/CodeGen/phi_condition_modeling_1.ll b/polly/test/CodeGen/phi_condition_modeling_1.ll
index b14d32921cf7..9d73d8a79255 100644
--- a/polly/test/CodeGen/phi_condition_modeling_1.ll
+++ b/polly/test/CodeGen/phi_condition_modeling_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; void f(int *A, int c, int N) {
; int tmp;
diff --git a/polly/test/CodeGen/phi_condition_modeling_2.ll b/polly/test/CodeGen/phi_condition_modeling_2.ll
index dab2977bf065..2d1364842d73 100644
--- a/polly/test/CodeGen/phi_condition_modeling_2.ll
+++ b/polly/test/CodeGen/phi_condition_modeling_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; void f(int *A, int c, int N) {
; int tmp;
diff --git a/polly/test/CodeGen/phi_conditional_simple_1.ll b/polly/test/CodeGen/phi_conditional_simple_1.ll
index f1b93b540f70..25bcf2a118ef 100644
--- a/polly/test/CodeGen/phi_conditional_simple_1.ll
+++ b/polly/test/CodeGen/phi_conditional_simple_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; void jd(int *A, int c) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll
index 13688480e315..43d29b9ec864 100644
--- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll
+++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused an lnt crash at some point, just verify it will run through.
;
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll
index 01dd450590d9..9f28024fcfa0 100644
--- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll
+++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused an lnt crash at some point, just verify it will run through and
; produce the PHI node in the exit we are looking for.
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll
index 66b95b0e0317..73e99ac0f32c 100644
--- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll
+++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused an lnt crash at some point, just verify it will run through and
; produce the PHI node in the exit we are looking for.
diff --git a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll
index 9a046367e768..6c9bd56a9872 100644
--- a/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll
+++ b/polly/test/CodeGen/phi_in_exit_early_lnt_failure_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; This caused an lnt crash at some point, just verify it will run through and
; produce the PHI node in the exit we are looking for.
diff --git a/polly/test/CodeGen/phi_loop_carried_float.ll b/polly/test/CodeGen/phi_loop_carried_float.ll
index ca1870fb3a09..d671db08b06c 100644
--- a/polly/test/CodeGen/phi_loop_carried_float.ll
+++ b/polly/test/CodeGen/phi_loop_carried_float.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; float f(float *A, int N) {
; float tmp = 0;
diff --git a/polly/test/CodeGen/phi_loop_carried_float_escape.ll b/polly/test/CodeGen/phi_loop_carried_float_escape.ll
index 3b2ed01863b1..3e244c5e1332 100644
--- a/polly/test/CodeGen/phi_loop_carried_float_escape.ll
+++ b/polly/test/CodeGen/phi_loop_carried_float_escape.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -S \
-; RUN: -polly-analyze-read-only-scalars=false -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S \
+; RUN: -polly-analyze-read-only-scalars=false -passes=polly-codegen < %s | FileCheck %s
-; RUN: opt %loadPolly -S \
-; RUN: -polly-analyze-read-only-scalars=true -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S \
+; RUN: -polly-analyze-read-only-scalars=true -passes=polly-codegen < %s | FileCheck %s
;
; float f(float *A, int N) {
; float tmp = 0;
diff --git a/polly/test/CodeGen/phi_scalar_simple_1.ll b/polly/test/CodeGen/phi_scalar_simple_1.ll
index d62975b6a7b3..80a1c41b83ac 100644
--- a/polly/test/CodeGen/phi_scalar_simple_1.ll
+++ b/polly/test/CodeGen/phi_scalar_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; int jd(int *restrict A, int x, int N) {
; for (int i = 1; i < N; i++)
diff --git a/polly/test/CodeGen/phi_scalar_simple_2.ll b/polly/test/CodeGen/phi_scalar_simple_2.ll
index e58945d39960..614c8acfb9f8 100644
--- a/polly/test/CodeGen/phi_scalar_simple_2.ll
+++ b/polly/test/CodeGen/phi_scalar_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; int jd(int *restrict A, int x, int N, int c) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll
index 17e4b7d6b4de..7e21666f1db0 100644
--- a/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll
+++ b/polly/test/CodeGen/phi_with_multi_exiting_edges_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK: polly.merge_new_and_old:
; CHECK: %result.ph.merge = phi float [ %result.ph.final_reload, %polly.exiting ], [ %result.ph, %next.region_exiting ]
diff --git a/polly/test/CodeGen/phi_with_one_exit_edge.ll b/polly/test/CodeGen/phi_with_one_exit_edge.ll
index 81fd73b51c79..36a8684dbc37 100644
--- a/polly/test/CodeGen/phi_with_one_exit_edge.ll
+++ b/polly/test/CodeGen/phi_with_one_exit_edge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
;
; CHECK: polly.merge_new_and_old:
diff --git a/polly/test/CodeGen/pointer-type-expressions-2.ll b/polly/test/CodeGen/pointer-type-expressions-2.ll
index b261cfe53321..918e4c6c9c0b 100644
--- a/polly/test/CodeGen/pointer-type-expressions-2.ll
+++ b/polly/test/CodeGen/pointer-type-expressions-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define void @foo(ptr %start, ptr %end) {
diff --git a/polly/test/CodeGen/pointer-type-expressions.ll b/polly/test/CodeGen/pointer-type-expressions.ll
index 6bb3fa242362..e7feebc163d4 100644
--- a/polly/test/CodeGen/pointer-type-expressions.ll
+++ b/polly/test/CodeGen/pointer-type-expressions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
; void f(int a[], int N, float *P) {
; int i;
diff --git a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll
index eaef64017aa7..9ee050a1e507 100644
--- a/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll
+++ b/polly/test/CodeGen/pointer-type-pointer-type-comparison.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
;
; void f(int a[], int N, float *P, float *Q) {
diff --git a/polly/test/CodeGen/pointer_rem.ll b/polly/test/CodeGen/pointer_rem.ll
index 5c92ee52da2c..b8202318a3ec 100644
--- a/polly/test/CodeGen/pointer_rem.ll
+++ b/polly/test/CodeGen/pointer_rem.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-scops -polly-print-ast -disable-output -S < %s | FileCheck %s --check-prefix=AST
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-scops -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(print<polly-ast>)' -disable-output -S < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>,scop(polly-codegen)' -S < %s | FileCheck %s --check-prefix=CODEGEN
target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
target triple = "aarch64--linux-gnu"
diff --git a/polly/test/CodeGen/pr25241.ll b/polly/test/CodeGen/pr25241.ll
index 9fa67e083a6c..4a4add8ba2a6 100644
--- a/polly/test/CodeGen/pr25241.ll
+++ b/polly/test/CodeGen/pr25241.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; PR25241 (https://llvm.org/bugs/show_bug.cgi?id=25241)
; Ensure that synthesized values of a PHI node argument are generated in the
diff --git a/polly/test/CodeGen/ptrtoint_as_parameter.ll b/polly/test/CodeGen/ptrtoint_as_parameter.ll
index 4f6c8079729d..a551d810c080 100644
--- a/polly/test/CodeGen/ptrtoint_as_parameter.ll
+++ b/polly/test/CodeGen/ptrtoint_as_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK: if.then260:
; CHECK-NEXT: %p.4 = getelementptr inbounds i8, ptr null, i64 1
diff --git a/polly/test/CodeGen/read-only-scalars.ll b/polly/test/CodeGen/read-only-scalars.ll
index a5e1d2719d7d..365cbbce495f 100644
--- a/polly/test/CodeGen/read-only-scalars.ll
+++ b/polly/test/CodeGen/read-only-scalars.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-analyze-read-only-scalars=false -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false -passes=polly-codegen \
; RUN: \
; RUN: -S < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-analyze-read-only-scalars=true -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true -passes=polly-codegen \
; RUN: \
; RUN: -S < %s | FileCheck %s -check-prefix=SCALAR
diff --git a/polly/test/CodeGen/reduction.ll b/polly/test/CodeGen/reduction.ll
index 6e5a230ad231..8c5f70770a1c 100644
--- a/polly/test/CodeGen/reduction.ll
+++ b/polly/test/CodeGen/reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | not FileCheck %s
;#include <string.h>
;#include <stdio.h>
diff --git a/polly/test/CodeGen/reduction_2.ll b/polly/test/CodeGen/reduction_2.ll
index 7a50cea31400..4aa306775e78 100644
--- a/polly/test/CodeGen/reduction_2.ll
+++ b/polly/test/CodeGen/reduction_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-invariant-load-hoisting=true -polly-print-ast -disable-output < %s | FileCheck %s --allow-empty
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --allow-empty
;#include <string.h>
;#include <stdio.h>
diff --git a/polly/test/CodeGen/reduction_simple_binary.ll b/polly/test/CodeGen/reduction_simple_binary.ll
index c7c5501bb7ed..0fe1085dbbac 100644
--- a/polly/test/CodeGen/reduction_simple_binary.ll
+++ b/polly/test/CodeGen/reduction_simple_binary.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: pragma simd reduction
;
diff --git a/polly/test/CodeGen/region-with-instructions.ll b/polly/test/CodeGen/region-with-instructions.ll
index 28cabefbf68b..e5f7d0f9ef5d 100644
--- a/polly/test/CodeGen/region-with-instructions.ll
+++ b/polly/test/CodeGen/region-with-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; CHECK-LABEL: polly.stmt.bb48:
; CHECK-NEXT: %[[offset:.*]] = shl i64 %polly.indvar, 3
diff --git a/polly/test/CodeGen/region_exiting-domtree.ll b/polly/test/CodeGen/region_exiting-domtree.ll
index 05983da0a3e3..06e0d9df3d95 100644
--- a/polly/test/CodeGen/region_exiting-domtree.ll
+++ b/polly/test/CodeGen/region_exiting-domtree.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-dom-info -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-dom-info -disable-output < %s
; Verify that the DominatorTree is preserved correctly for the inserted
; %polly.stmt.exit.exit block, which serves as new exit block for the generated
diff --git a/polly/test/CodeGen/region_multiexit_partialwrite.ll b/polly/test/CodeGen/region_multiexit_partialwrite.ll
index b98d7f58732a..39e04dbf93ac 100644
--- a/polly/test/CodeGen/region_multiexit_partialwrite.ll
+++ b/polly/test/CodeGen/region_multiexit_partialwrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-codegen' -polly-import-jscop-postfix=transformed -S < %s | FileCheck %s
;
; This test case has a partial write of PHI in a region-statement. It
; requires that the new PHINode from the region's exiting block is
diff --git a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll
index 0f62a8c743df..4afaab5bbad0 100644
--- a/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll
+++ b/polly/test/CodeGen/run-time-condition-with-scev-parameters.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; TODO: FIXME: Simplify the context.
; AST: if (n >= 1 && 0 == n <= -1)
diff --git a/polly/test/CodeGen/run-time-condition.ll b/polly/test/CodeGen/run-time-condition.ll
index 0faefad8aef4..914b76f5e0be 100644
--- a/polly/test/CodeGen/run-time-condition.ll
+++ b/polly/test/CodeGen/run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll
index 3f88942c2300..0b49da0d0e09 100644
--- a/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll
+++ b/polly/test/CodeGen/scalar-references-used-in-scop-compute.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; Test the code generation in the presence of a scalar out-of-scop value being
; used from within the SCoP.
diff --git a/polly/test/CodeGen/scalar-store-from-same-bb.ll b/polly/test/CodeGen/scalar-store-from-same-bb.ll
index ac8fab4b7a0d..3f232da37e4c 100644
--- a/polly/test/CodeGen/scalar-store-from-same-bb.ll
+++ b/polly/test/CodeGen/scalar-store-from-same-bb.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -passes=polly-codegen -S < %s | FileCheck %s
; This test ensures that the expression N + 1 that is stored in the phi-node
; alloca, is directly computed and not incorrectly transferred through memory.
diff --git a/polly/test/CodeGen/scalar_codegen_crash.ll b/polly/test/CodeGen/scalar_codegen_crash.ll
index c41a00f59e81..375f097283b0 100644
--- a/polly/test/CodeGen/scalar_codegen_crash.ll
+++ b/polly/test/CodeGen/scalar_codegen_crash.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -passes=polly-codegen -S < %s | FileCheck %s
; This test case used to crash the scalar code generation. Check that we
; can generate code for it.
diff --git a/polly/test/CodeGen/scev-backedgetaken.ll b/polly/test/CodeGen/scev-backedgetaken.ll
index 15e12ee8b451..f5e68ec930d1 100644
--- a/polly/test/CodeGen/scev-backedgetaken.ll
+++ b/polly/test/CodeGen/scev-backedgetaken.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; llvm.org/PR48422
; Use of ScalarEvolution in Codegen not possible because DominatorTree is not updated.
diff --git a/polly/test/CodeGen/scev-division-invariant-load.ll b/polly/test/CodeGen/scev-division-invariant-load.ll
index 3156bdc9f5ce..70f090eae07b 100644
--- a/polly/test/CodeGen/scev-division-invariant-load.ll
+++ b/polly/test/CodeGen/scev-division-invariant-load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s
;
; Check that we generate valid code as we did not use the preloaded
; value of %tmp1 for the access function of the preloaded %tmp4.
diff --git a/polly/test/CodeGen/scev.ll b/polly/test/CodeGen/scev.ll
index 07d726d97caf..e2b5afda1bff 100644
--- a/polly/test/CodeGen/scev.ll
+++ b/polly/test/CodeGen/scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define fastcc void @f () inlinehint align 2 {
diff --git a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
index f61f21d4adb8..6c6c2572da10 100644
--- a/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
+++ b/polly/test/CodeGen/scev_expansion_in_nonaffine.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
; bugpoint-reduced testcase of MiBench/consumer-lame/quantize-pvt.c from the
diff --git a/polly/test/CodeGen/scev_looking_through_bitcasts.ll b/polly/test/CodeGen/scev_looking_through_bitcasts.ll
index c87d932479b7..142e83f820fe 100644
--- a/polly/test/CodeGen/scev_looking_through_bitcasts.ll
+++ b/polly/test/CodeGen/scev_looking_through_bitcasts.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Scalar write of bitcasted value. Instead of writing %b of type
; %structty, the SCEV expression looks through the bitcast such that
diff --git a/polly/test/CodeGen/scop_expander_insert_point.ll b/polly/test/CodeGen/scop_expander_insert_point.ll
index 8492873b22ed..92f2772155ee 100644
--- a/polly/test/CodeGen/scop_expander_insert_point.ll
+++ b/polly/test/CodeGen/scop_expander_insert_point.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
;
; CHECK: entry:
diff --git a/polly/test/CodeGen/scop_expander_segfault.ll b/polly/test/CodeGen/scop_expander_segfault.ll
index 293c1e527959..d94a1fdfb2c1 100644
--- a/polly/test/CodeGen/scop_expander_segfault.ll
+++ b/polly/test/CodeGen/scop_expander_segfault.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S %s | FileCheck %s
;
; This test was extracted from gcc in SPEC2006 and it crashed our code
; generation, or to be more precise, the ScopExpander due to an endless
diff --git a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll
index 91a58159b5f9..9f968e5657c9 100644
--- a/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll
+++ b/polly/test/CodeGen/scop_never_executed_runtime_check_location.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; Verify that we generate the runtime check code after the conditional branch
; in the SCoP region entering block (here %entry).
diff --git a/polly/test/CodeGen/select-base-pointer.ll b/polly/test/CodeGen/select-base-pointer.ll
index 29bc40074e1f..85be37755c47 100644
--- a/polly/test/CodeGen/select-base-pointer.ll
+++ b/polly/test/CodeGen/select-base-pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -tbaa -polly-codegen -disable-output %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa -passes=polly-codegen -disable-output %s
;
; Check that we do not crash here.
;
diff --git a/polly/test/CodeGen/sequential_loops.ll b/polly/test/CodeGen/sequential_loops.ll
index 97d280de3cd2..33a3ee9fbbd4 100644
--- a/polly/test/CodeGen/sequential_loops.ll
+++ b/polly/test/CodeGen/sequential_loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include <string.h>
;#define N 1024
diff --git a/polly/test/CodeGen/simple_loop_non_single_exit.ll b/polly/test/CodeGen/simple_loop_non_single_exit.ll
index dc1b09b765a1..a7e36bc4c733 100644
--- a/polly/test/CodeGen/simple_loop_non_single_exit.ll
+++ b/polly/test/CodeGen/simple_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll
index 178601cac9b8..22e9da09ef85 100644
--- a/polly/test/CodeGen/simple_loop_non_single_exit_2.ll
+++ b/polly/test/CodeGen/simple_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/CodeGen/simple_non_single_entry.ll b/polly/test/CodeGen/simple_non_single_entry.ll
index 3b4bf59bdc65..c33a77ae0793 100644
--- a/polly/test/CodeGen/simple_non_single_entry.ll
+++ b/polly/test/CodeGen/simple_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CHECK-CODE
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/CodeGen/simple_nonaffine_loop.ll b/polly/test/CodeGen/simple_nonaffine_loop.ll
index d4e9c6082e6c..bc62047a80a3 100644
--- a/polly/test/CodeGen/simple_nonaffine_loop.ll
+++ b/polly/test/CodeGen/simple_nonaffine_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-allow-nonaffine -disable-output < %s | FileCheck %s
;#include <stdio.h>
;#include <stdlib.h>
diff --git a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll
index 9648fbe1cf12..a65e3a25f035 100644
--- a/polly/test/CodeGen/single_do_loop_int_max_iterations.ll
+++ b/polly/test/CodeGen/single_do_loop_int_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#define N 20
;#include "limits.h"
diff --git a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll
index f28d828a5da0..acccb48f18a3 100644
--- a/polly/test/CodeGen/single_do_loop_int_param_iterations.ll
+++ b/polly/test/CodeGen/single_do_loop_int_param_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;define N 20
diff --git a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll
index 68aaab96083a..7a67f6ba96ce 100644
--- a/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll
+++ b/polly/test/CodeGen/single_do_loop_ll_max_iterations.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s
;#define N 20
;#include "limits.h"
diff --git a/polly/test/CodeGen/single_do_loop_one_iteration.ll b/polly/test/CodeGen/single_do_loop_one_iteration.ll
index 9d97cb854734..2d939167b71e 100644
--- a/polly/test/CodeGen/single_do_loop_one_iteration.ll
+++ b/polly/test/CodeGen/single_do_loop_one_iteration.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;#define N 20
diff --git a/polly/test/CodeGen/single_do_loop_scev_replace.ll b/polly/test/CodeGen/single_do_loop_scev_replace.ll
index 7963d9d29fe8..83c9e9d0324c 100644
--- a/polly/test/CodeGen/single_do_loop_scev_replace.ll
+++ b/polly/test/CodeGen/single_do_loop_scev_replace.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#define N 20
;#include "limits.h"
diff --git a/polly/test/CodeGen/single_loop.ll b/polly/test/CodeGen/single_loop.ll
index 68cc498b43e0..2db34663e93c 100644
--- a/polly/test/CodeGen/single_loop.ll
+++ b/polly/test/CodeGen/single_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include <string.h>
;#define N 1024
diff --git a/polly/test/CodeGen/single_loop_int_max_iterations.ll b/polly/test/CodeGen/single_loop_int_max_iterations.ll
index bfb5e4ab2698..f83e8823c63d 100644
--- a/polly/test/CodeGen/single_loop_int_max_iterations.ll
+++ b/polly/test/CodeGen/single_loop_int_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#define N 20
;#include "limits.h"
diff --git a/polly/test/CodeGen/single_loop_ll_max_iterations.ll b/polly/test/CodeGen/single_loop_ll_max_iterations.ll
index bdfd7fce4204..1427189d74a7 100644
--- a/polly/test/CodeGen/single_loop_ll_max_iterations.ll
+++ b/polly/test/CodeGen/single_loop_ll_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#include "limits.h"
;#define N 20
diff --git a/polly/test/CodeGen/single_loop_one_iteration.ll b/polly/test/CodeGen/single_loop_one_iteration.ll
index 7d4dd590fab9..1a70d4a879d8 100644
--- a/polly/test/CodeGen/single_loop_one_iteration.ll
+++ b/polly/test/CodeGen/single_loop_one_iteration.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;#define N 20
;
diff --git a/polly/test/CodeGen/single_loop_param.ll b/polly/test/CodeGen/single_loop_param.ll
index 5d72da354fdc..44ce1236e9f8 100644
--- a/polly/test/CodeGen/single_loop_param.ll
+++ b/polly/test/CodeGen/single_loop_param.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@A = common global [1024 x i32] zeroinitializer, align 16 ; <ptr> [#uses=3]
diff --git a/polly/test/CodeGen/single_loop_param_less_equal.ll b/polly/test/CodeGen/single_loop_param_less_equal.ll
index e63ee299a37c..fda9bfab11b8 100644
--- a/polly/test/CodeGen/single_loop_param_less_equal.ll
+++ b/polly/test/CodeGen/single_loop_param_less_equal.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
-; RUN: opt %loadPolly -polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -passes=polly-codegen < %s | opt -passes='print<loops>' -disable-output 2>&1 | FileCheck %s -check-prefix=LOOPS
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@A = common global [1024 x i32] zeroinitializer
diff --git a/polly/test/CodeGen/single_loop_param_less_than.ll b/polly/test/CodeGen/single_loop_param_less_than.ll
index 95130f926450..b888c860eacd 100644
--- a/polly/test/CodeGen/single_loop_param_less_than.ll
+++ b/polly/test/CodeGen/single_loop_param_less_than.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@A = common global [1024 x i32] zeroinitializer
diff --git a/polly/test/CodeGen/single_loop_zero_iterations.ll b/polly/test/CodeGen/single_loop_zero_iterations.ll
index 4f189687d330..b1ce491b5c8a 100644
--- a/polly/test/CodeGen/single_loop_zero_iterations.ll
+++ b/polly/test/CodeGen/single_loop_zero_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=SCALAR --allow-empty
;#define N 20
;
diff --git a/polly/test/CodeGen/split_edge_of_exit.ll b/polly/test/CodeGen/split_edge_of_exit.ll
index 56ce215a62b2..f4b17e687ada 100644
--- a/polly/test/CodeGen/split_edge_of_exit.ll
+++ b/polly/test/CodeGen/split_edge_of_exit.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -verify-region-info -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -disable-output < %s
;
; This is a scop directly preceded by a region, i.e. the scop's entry is the
; region's exit block. This test is to ensure that the RegionInfo is correctly
diff --git a/polly/test/CodeGen/split_edges.ll b/polly/test/CodeGen/split_edges.ll
index e01d901e298c..b921202285bb 100644
--- a/polly/test/CodeGen/split_edges.ll
+++ b/polly/test/CodeGen/split_edges.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@A = common global [1536 x float] zeroinitializer
diff --git a/polly/test/CodeGen/split_edges_2.ll b/polly/test/CodeGen/split_edges_2.ll
index 4135d6feeb3e..8f4d48f5dcb0 100644
--- a/polly/test/CodeGen/split_edges_2.ll
+++ b/polly/test/CodeGen/split_edges_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -verify-region-info -verify-dom-info -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/CodeGen/srem-in-other-bb.ll b/polly/test/CodeGen/srem-in-other-bb.ll
index 8bde1a3bbc1d..a13a1b6ab98f 100644
--- a/polly/test/CodeGen/srem-in-other-bb.ll
+++ b/polly/test/CodeGen/srem-in-other-bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S \
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
; RUN: < %s | FileCheck %s
;
; void pos(float *A, long n) {
diff --git a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
index 02dfe96e3e91..cb9d9a2ec492 100644
--- a/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
+++ b/polly/test/CodeGen/stack-overflow-in-load-hoisting.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -verify-dom-info -polly-codegen -S < %s \
+; RUN: opt %loadNPMPolly -verify-dom-info -passes=polly-codegen -S < %s \
; RUN: -polly-invariant-load-hoisting=true | FileCheck %s
;
; This caused an infinite recursion during invariant load hoisting at some
diff --git a/polly/test/CodeGen/stmt_split_no_dependence.ll b/polly/test/CodeGen/stmt_split_no_dependence.ll
index a395aa14b4c8..381cd30a2ae6 100644
--- a/polly/test/CodeGen/stmt_split_no_dependence.ll
+++ b/polly/test/CodeGen/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK: store i32 %9, ptr %scevgep, align 4, !alias.scope !1, !noalias !4
; CHECK: store i32 %11, ptr %scevgep4, align 4, !alias.scope !4, !noalias !1
diff --git a/polly/test/CodeGen/switch-in-non-affine-region.ll b/polly/test/CodeGen/switch-in-non-affine-region.ll
index 930755ef5648..1a9e7081bebd 100644
--- a/polly/test/CodeGen/switch-in-non-affine-region.ll
+++ b/polly/test/CodeGen/switch-in-non-affine-region.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -S -passes=polly-codegen < %s | FileCheck %s
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
index 6a8d3b94d1cc..b2a062363eef 100644
--- a/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
+++ b/polly/test/CodeGen/synthesizable_phi_write_after_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Check for the correct written value of a scalar phi write whose value is
; defined within the loop, but its effective value is its last definition when
diff --git a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
index 5fa4773398fd..5668063c27c8 100644
--- a/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
+++ b/polly/test/CodeGen/test-invalid-operands-for-select-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen -verify-loop-info < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen -verify-loop-info < %s | FileCheck %s
;
; Check that we do not crash as described here: http://llvm.org/bugs/show_bug.cgi?id=21167
;
diff --git a/polly/test/CodeGen/test-invalid-operands-for-select.ll b/polly/test/CodeGen/test-invalid-operands-for-select.ll
index 40695af3e847..9f5013cf1bb1 100644
--- a/polly/test/CodeGen/test-invalid-operands-for-select.ll
+++ b/polly/test/CodeGen/test-invalid-operands-for-select.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; Check that we do not crash as described here: http://llvm.org/PR21167
;
diff --git a/polly/test/CodeGen/test.ll b/polly/test/CodeGen/test.ll
index ac99688ed9e8..aad998ba2728 100644
--- a/polly/test/CodeGen/test.ll
+++ b/polly/test/CodeGen/test.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;int bar1();
diff --git a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
index a7cae0a921ca..1c68389eaeba 100644
--- a/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
+++ b/polly/test/CodeGen/two-loops-right-after-each-other-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
; CHECK: polly.merge_new_and_old:
; CHECK-NEXT: merge = phi
diff --git a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
index 4470f970fc1e..4396c38310dc 100644
--- a/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
+++ b/polly/test/CodeGen/two-scops-in-row-invalidate-scevs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; CHECK-LABEL: for.cond:
; CHECK: %num.0 = phi i32 [ %add, %for.body15 ], [ 0, %for.cond.pre_entry_bb ]
diff --git a/polly/test/CodeGen/two-scops-in-row.ll b/polly/test/CodeGen/two-scops-in-row.ll
index 3e922cba1916..dd3f310ef150 100644
--- a/polly/test/CodeGen/two-scops-in-row.ll
+++ b/polly/test/CodeGen/two-scops-in-row.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR
-; RUN: opt %loadPolly -polly-codegen -polly-ignore-aliasing -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ignore-aliasing -disable-output < %s | FileCheck %s -check-prefix=SCALAR
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-ignore-aliasing -disable-output < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; SCALAR: if (
diff --git a/polly/test/CodeGen/udiv_expansion_position.ll b/polly/test/CodeGen/udiv_expansion_position.ll
index bb37fed4a41e..354e3cd18010 100644
--- a/polly/test/CodeGen/udiv_expansion_position.ll
+++ b/polly/test/CodeGen/udiv_expansion_position.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s | FileCheck %s
;
; Verify we do not crash when we synthesize code for the udiv in the SCoP.
;
diff --git a/polly/test/CodeGen/uninitialized_scalar_memory.ll b/polly/test/CodeGen/uninitialized_scalar_memory.ll
index 935ccc3d6289..e08af07e604e 100644
--- a/polly/test/CodeGen/uninitialized_scalar_memory.ll
+++ b/polly/test/CodeGen/uninitialized_scalar_memory.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s | FileCheck %s
;
; Verify we initialize the scalar locations reserved for the incoming phi
; values.
diff --git a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
index 9164bb4532e6..46706804a81b 100644
--- a/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
+++ b/polly/test/CodeGen/unpredictable-loop-unsynthesizable.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-codegen \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen \
; RUN: -polly-invariant-load-hoisting=true -disable-output < %s
; The loop for.body is a scop with invariant load hoisting, but does not
diff --git a/polly/test/CodeGen/variant_load_empty_domain.ll b/polly/test/CodeGen/variant_load_empty_domain.ll
index f5ad0b195818..6f2d3dc582db 100644
--- a/polly/test/CodeGen/variant_load_empty_domain.ll
+++ b/polly/test/CodeGen/variant_load_empty_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
;
;
; void f(int *A) {
diff --git a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
index 931e644f6b8f..b342b1cb5aa2 100644
--- a/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
+++ b/polly/test/CodeGen/whole-scop-non-affine-subregion.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly \
-; RUN: -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly \
+; RUN: -passes=polly-codegen -S < %s | FileCheck %s
; CHECK: polly.start
; int /* pure */ g()
diff --git a/polly/test/DeLICM/confused_order.ll b/polly/test/DeLICM/confused_order.ll
index 2015ebcf58f1..0c19eb6aa605 100644
--- a/polly/test/DeLICM/confused_order.ll
+++ b/polly/test/DeLICM/confused_order.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-delicm -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-delicm>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-delicm' -polly-import-jscop-postfix=transformed -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s -check-prefix=REMARKS
;
; ForwardOptree changes the SCoP and may already map some accesses.
; DeLICM must be prepared to encounter implicit reads
diff --git a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
index 4e039b22b415..66d9ae889e65 100644
--- a/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
+++ b/polly/test/DeLICM/contradicting_assumed_context_and_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; The domain of bb14 contradicts the SCoP's assumptions. This leads to
; 'anything goes' inside the statement since it is never executed,
diff --git a/polly/test/DeLICM/load-in-cond-inf-loop.ll b/polly/test/DeLICM/load-in-cond-inf-loop.ll
index f0aecfd87a15..f6e23110aa6f 100644
--- a/polly/test/DeLICM/load-in-cond-inf-loop.ll
+++ b/polly/test/DeLICM/load-in-cond-inf-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
; When %b is 0, %for.body13 is an infinite loop. In this case the loaded
; value %1 is not used anywhere.
diff --git a/polly/test/DeLICM/map_memset_zero.ll b/polly/test/DeLICM/map_memset_zero.ll
index 1a08eee63fe9..9a8e5989fdad 100644
--- a/polly/test/DeLICM/map_memset_zero.ll
+++ b/polly/test/DeLICM/map_memset_zero.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
-; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-delicm>)" -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
;
; Check that PHI mapping works even in the presence of a memset whose
; zero value is used.
diff --git a/polly/test/DeLICM/nomap_alreadymapped.ll b/polly/test/DeLICM/nomap_alreadymapped.ll
index 7adf4ba88385..da5f4ec24a47 100644
--- a/polly/test/DeLICM/nomap_alreadymapped.ll
+++ b/polly/test/DeLICM/nomap_alreadymapped.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_escaping.ll b/polly/test/DeLICM/nomap_escaping.ll
index 034c0a96ccf2..60955368fe59 100644
--- a/polly/test/DeLICM/nomap_escaping.ll
+++ b/polly/test/DeLICM/nomap_escaping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_occupied.ll b/polly/test/DeLICM/nomap_occupied.ll
index db33532b1e65..9ba8ce264123 100644
--- a/polly/test/DeLICM/nomap_occupied.ll
+++ b/polly/test/DeLICM/nomap_occupied.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_readonly.ll b/polly/test/DeLICM/nomap_readonly.ll
index 1f3b5746fe9b..7a185d336bad 100644
--- a/polly/test/DeLICM/nomap_readonly.ll
+++ b/polly/test/DeLICM/nomap_readonly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; fsomeval = 21.0 + 21.0;
diff --git a/polly/test/DeLICM/nomap_spuriouswrite.ll b/polly/test/DeLICM/nomap_spuriouswrite.ll
index ef470f715bbe..0ed7f6ee8e23 100644
--- a/polly/test/DeLICM/nomap_spuriouswrite.ll
+++ b/polly/test/DeLICM/nomap_spuriouswrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_storagesize.ll b/polly/test/DeLICM/nomap_storagesize.ll
index fab8d54c2bdf..bf851ac342d2 100644
--- a/polly/test/DeLICM/nomap_storagesize.ll
+++ b/polly/test/DeLICM/nomap_storagesize.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(float *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/nomap_writewrite.ll b/polly/test/DeLICM/nomap_writewrite.ll
index 06192d9ae19e..9fcd52aad743 100644
--- a/polly/test/DeLICM/nomap_writewrite.ll
+++ b/polly/test/DeLICM/nomap_writewrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/outofquota-reverseDomain.ll b/polly/test/DeLICM/outofquota-reverseDomain.ll
index d40ee03cf3bc..1f7527c84120 100644
--- a/polly/test/DeLICM/outofquota-reverseDomain.ll
+++ b/polly/test/DeLICM/outofquota-reverseDomain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-delicm-max-ops=1000000 -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-delicm-max-ops=1000000 '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; This causes an assertion to fail on out-of-quota after 1000000 operations.
; (The error was specific to -polly-delicm-max-ops=1000000 and changes
diff --git a/polly/test/DeLICM/pass_existence.ll b/polly/test/DeLICM/pass_existence.ll
index 7ed2da9c1da1..64302d998326 100644
--- a/polly/test/DeLICM/pass_existence.ll
+++ b/polly/test/DeLICM/pass_existence.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-delicm -disable-output < %s
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadNPMPolly "-passes=scop(print<polly-delicm>)" -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-delicm -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=scop(print<polly-delicm>)' -disable-output < %s | FileCheck %s
;
; Simple test for the existence of the DeLICM pass.
;
diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll
index 965ad9f62ac3..d7cfde35a6e8 100644
--- a/polly/test/DeLICM/pr41656.ll
+++ b/polly/test/DeLICM/pr41656.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s
;
; llvm.org/PR41656
;
diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll
index 3cbd54b93baf..e3c3eb6a19cc 100644
--- a/polly/test/DeLICM/pr48783.ll
+++ b/polly/test/DeLICM/pr48783.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-delicm>)' -disable-output < %s 2>&1 | FileCheck %s
;
; llvm.org/PR48783
;
diff --git a/polly/test/DeLICM/reduction.ll b/polly/test/DeLICM/reduction.ll
index 78c1a4ce5288..29b7a3617300 100644
--- a/polly/test/DeLICM/reduction.ll
+++ b/polly/test/DeLICM/reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
index b5bc0d589c65..d9c5268e631d 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Load (but not store) of A[j] hoisted, reduction only over some iterations.
;
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
index e995be1143a6..6a4223f5af65 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_cond2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Load (but not store) of A[j] hoisted, reduction not written in all iterations.
; FIXME: %join is not mapped because the MemoryKind::Value mapping does not
diff --git a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
index ca3a1211ca49..bf4b8018d552 100644
--- a/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
+++ b/polly/test/DeLICM/reduction_looprotate_gvnpre_nopreheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Hoisted reduction load (but not the store) without preheader.
;
diff --git a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
index 41538239fbd8..027df44e8619 100644
--- a/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
+++ b/polly/test/DeLICM/reduction_looprotate_licm_nopreheader.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
;
; Register-promoted reduction but without preheader.
;
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
index 35c723e864d2..4ea3fa53a339 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_gvnpre.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Reduction over parametric number of elements and a loopguard if the
; reduction loop is not executed at all. Load hoisted before loop.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
index 2b5f4d8151a8..2e7abe444ad6 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Reduction over parametric number of elements and a loopguard if the
; reduction loop is not executed at all.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
index 2e92813d5551..60afdeb5fc97 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Reduction over parametric number of elements and a loopguard if the
; reduction loop is not executed at all, such that A[j] is also not written to.
diff --git a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
index 784c8ef2d321..e63b457de92d 100644
--- a/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
+++ b/polly/test/DeLICM/reduction_looprotate_loopguard_licm3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Reduction over parametric number of elements and a loopguard if the
; reduction loop is not executed at all, such that A[j] is also not accessed.
diff --git a/polly/test/DeLICM/reduction_unrelatedunusual.ll b/polly/test/DeLICM/reduction_unrelatedunusual.ll
index 04c437770700..97826f603e5d 100644
--- a/polly/test/DeLICM/reduction_unrelatedunusual.ll
+++ b/polly/test/DeLICM/reduction_unrelatedunusual.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true -polly-print-delicm -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-delicm-partial-writes=true '-passes=print<polly-delicm>' -disable-output < %s | FileCheck -match-full-lines %s
;
; Map %add and %phi to A[j].
; The non-analyzable store to C[0] is unrelated and can be ignored.
diff --git a/polly/test/DeLICM/reject_loadafterstore.ll b/polly/test/DeLICM/reject_loadafterstore.ll
index 8af6e5e4818c..4460620852a8 100644
--- a/polly/test/DeLICM/reject_loadafterstore.ll
+++ b/polly/test/DeLICM/reject_loadafterstore.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output -pass-remarks-missed=polly-delicm < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_outofquota.ll b/polly/test/DeLICM/reject_outofquota.ll
index 551431f0823c..820679a5349d 100644
--- a/polly/test/DeLICM/reject_outofquota.ll
+++ b/polly/test/DeLICM/reject_outofquota.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-delicm -polly-print-dependences -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-analysis=polly-delicm -polly-delicm-max-ops=1 -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,print<polly-dependences>' -polly-delicm-max-ops=1 -polly-dependences-computeout=0 -disable-output < %s | FileCheck %s -check-prefix=DEP
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_storeafterstore.ll b/polly/test/DeLICM/reject_storeafterstore.ll
index 1ec5ef67344c..ddd13dad2ed3 100644
--- a/polly/test/DeLICM/reject_storeafterstore.ll
+++ b/polly/test/DeLICM/reject_storeafterstore.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_storeinsubregion.ll b/polly/test/DeLICM/reject_storeinsubregion.ll
index 1d38e8066568..c987156b51cd 100644
--- a/polly/test/DeLICM/reject_storeinsubregion.ll
+++ b/polly/test/DeLICM/reject_storeinsubregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/reject_unusualstore.ll b/polly/test/DeLICM/reject_unusualstore.ll
index a18a0c3ce9c4..342888c6654f 100644
--- a/polly/test/DeLICM/reject_unusualstore.ll
+++ b/polly/test/DeLICM/reject_unusualstore.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-delicm -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-delicm -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-delicm>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-delicm -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STATS
; REQUIRES: asserts
;
; void func(double *A) {
diff --git a/polly/test/DeLICM/skip_maywrite.ll b/polly/test/DeLICM/skip_maywrite.ll
index 1e5f6b169fe4..0d30791cd94e 100644
--- a/polly/test/DeLICM/skip_maywrite.ll
+++ b/polly/test/DeLICM/skip_maywrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeLICM/skip_multiaccess.ll b/polly/test/DeLICM/skip_multiaccess.ll
index 6a8c8e5325e1..a7c79f752463 100644
--- a/polly/test/DeLICM/skip_multiaccess.ll
+++ b/polly/test/DeLICM/skip_multiaccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; llvm.org/PR34485
; llvm.org/PR34989
diff --git a/polly/test/DeLICM/skip_notinloop.ll b/polly/test/DeLICM/skip_notinloop.ll
index 0730a3a9a4f5..8e265e19aefe 100644
--- a/polly/test/DeLICM/skip_notinloop.ll
+++ b/polly/test/DeLICM/skip_notinloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; double phi = 0.0;
diff --git a/polly/test/DeLICM/skip_scalaraccess.ll b/polly/test/DeLICM/skip_scalaraccess.ll
index fa95d382409a..2cf13afe11cd 100644
--- a/polly/test/DeLICM/skip_scalaraccess.ll
+++ b/polly/test/DeLICM/skip_scalaraccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -pass-remarks-missed=polly-delicm -disable-output < %s 2>&1 | FileCheck %s
;
; void func(double *A) {
; for (int j = 0; j < 2; j += 1) { /* outer */
diff --git a/polly/test/DeadCodeElimination/chained_iterations.ll b/polly/test/DeadCodeElimination/chained_iterations.ll
index b79fdd659aae..f3bf07bb40d8 100644
--- a/polly/test/DeadCodeElimination/chained_iterations.ll
+++ b/polly/test/DeadCodeElimination/chained_iterations.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
;
; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/chained_iterations_2.ll b/polly/test/DeadCodeElimination/chained_iterations_2.ll
index 1d1af92db5da..52f034f0e56c 100644
--- a/polly/test/DeadCodeElimination/chained_iterations_2.ll
+++ b/polly/test/DeadCodeElimination/chained_iterations_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
;
; for(i = 0; i < 200; i++ )
diff --git a/polly/test/DeadCodeElimination/computeout.ll b/polly/test/DeadCodeElimination/computeout.ll
index 51850d7da349..e54df42ed1db 100644
--- a/polly/test/DeadCodeElimination/computeout.ll
+++ b/polly/test/DeadCodeElimination/computeout.ll
@@ -1,6 +1,5 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s
; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-dce -polly-print-ast -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
index f496f7828e3d..c102f60abb65 100644
--- a/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
+++ b/polly/test/DeadCodeElimination/dead_iteration_elimination.ll
@@ -1,4 +1,3 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-dce -polly-dce-precise-steps=2 -polly-print-ast -disable-output < %s | FileCheck %s
; RUN: opt -S %loadNPMPolly "-passes=scop(polly-dce,print<polly-ast>)" -polly-dependences-analysis-type=value-based -polly-dce-precise-steps=2 < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
;
diff --git a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
index e6a5dd204ca1..36f55476fed2 100644
--- a/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
+++ b/polly/test/DeadCodeElimination/non-affine-affine-mix.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; void f(int *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/DeadCodeElimination/non-affine.ll b/polly/test/DeadCodeElimination/non-affine.ll
index 38a7fcbcf9c9..ef528b4124c6 100644
--- a/polly/test/DeadCodeElimination/non-affine.ll
+++ b/polly/test/DeadCodeElimination/non-affine.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
;
diff --git a/polly/test/DeadCodeElimination/null_schedule.ll b/polly/test/DeadCodeElimination/null_schedule.ll
index 633a84b5d92b..01d34e95629b 100644
--- a/polly/test/DeadCodeElimination/null_schedule.ll
+++ b/polly/test/DeadCodeElimination/null_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-dependences-analysis-type=value-based -polly-dce -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-dependences-analysis-type=value-based '-passes=polly-dce,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-DCE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; A[0] = 1;
;
diff --git a/polly/test/DependenceInfo/computeout.ll b/polly/test/DependenceInfo/computeout.ll
index 048de29864d3..c2a3456b3dc8 100644
--- a/polly/test/DependenceInfo/computeout.ll
+++ b/polly/test/DependenceInfo/computeout.ll
@@ -1,7 +1,5 @@
-; RUN: opt -S %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt -S %loadPolly -polly-print-function-dependences -disable-output < %s | FileCheck %s -check-prefix=FUNC-VALUE
-; RUN: opt -S %loadPolly -polly-print-dependences -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
-; RUN: opt -S %loadPolly -polly-print-function-dependences -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DependenceInfo/different_schedule_dimensions.ll b/polly/test/DependenceInfo/different_schedule_dimensions.ll
index 3f966168d3b7..f89791f42f9d 100644
--- a/polly/test/DependenceInfo/different_schedule_dimensions.ll
+++ b/polly/test/DependenceInfo/different_schedule_dimensions.ll
@@ -1,7 +1,5 @@
-; RUN: opt -S %loadPolly -polly-print-dependences \
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' \
; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -polly-print-function-dependences \
-; RUN: -disable-output < %s | FileCheck %s -check-prefix=FUNC
; CHECK: RAW dependences:
; CHECK: { Stmt_bb9[0] -> Stmt_bb10[0] }
diff --git a/polly/test/DependenceInfo/do_pluto_matmult.ll b/polly/test/DependenceInfo/do_pluto_matmult.ll
index d71608e80e70..b88cf9bf5475 100644
--- a/polly/test/DependenceInfo/do_pluto_matmult.ll
+++ b/polly/test/DependenceInfo/do_pluto_matmult.ll
@@ -1,7 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
-; RUN: opt %loadPolly -basic-aa -polly-print-function-dependences -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=FUNC-VALUE
-; RUN: opt %loadPolly -basic-aa -polly-print-function-dependences -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=FUNC-MEMORY
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/DependenceInfo/fine_grain_dep_0.ll b/polly/test/DependenceInfo/fine_grain_dep_0.ll
index 9c79e360690a..f93814c1c4be 100644
--- a/polly/test/DependenceInfo/fine_grain_dep_0.ll
+++ b/polly/test/DependenceInfo/fine_grain_dep_0.ll
@@ -1,7 +1,6 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-dependences -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-dependences -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-function-dependences -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
-;
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s --check-prefix=REF
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s --check-prefix=ACC
+
; REF: RAW dependences:
; REF-NEXT: [N] -> { [Stmt_for_body[i0] -> MemRef_b[]] -> [Stmt_for_body[6 + i0] -> MemRef_b[]] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[6 + i0] : 0 <= i0 <= -13 + N; Stmt_for_body[i0] -> Stmt_for_body[4 + i0] : 0 <= i0 <= -11 + N; [Stmt_for_body[i0] -> MemRef_a[]] -> [Stmt_for_body[4 + i0] -> MemRef_a[]] : 0 <= i0 <= -11 + N }
; REF-NEXT: WAR dependences:
diff --git a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
index 0b7f2d48da9f..677323495476 100644
--- a/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
+++ b/polly/test/DependenceInfo/generate_may_write_dependence_info.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
; for (int i = 0; i < N; i++) {
diff --git a/polly/test/DependenceInfo/infeasible_context.ll b/polly/test/DependenceInfo/infeasible_context.ll
index d701b821e15c..cde3102dc3dc 100644
--- a/polly/test/DependenceInfo/infeasible_context.ll
+++ b/polly/test/DependenceInfo/infeasible_context.ll
@@ -1,10 +1,9 @@
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=FUNC-SCOP
-; RUN: opt %loadPolly -polly-print-function-dependences -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(print<polly-dependences>)' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=FUNC-DEPS
;
; FUNC-SCOP-NOT: Statement
-; FUNC-DEPS-LABEL: Printing analysis 'Polly - Calculate dependences for all the SCoPs of a function' for function 'readgeo'
; FUNC-DEPS-NOT: RAW dependences
;
; Due to an infeasible run-time check, scop object is empty and we do not compute dependences.
diff --git a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
index 09c516274708..392a34769cdd 100644
--- a/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
+++ b/polly/test/DependenceInfo/may_writes_do_not_block_must_writes_for_war.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; Verify that the presence of a may-write (S1) between a read (S0) and a
; must-write (S2) does not block the generation of RAW dependences. This makes
diff --git a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
index 25c7e3d6e442..ae5fd3beed39 100644
--- a/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
+++ b/polly/test/DependenceInfo/nonaffine-condition-buildMemoryAccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-allow-nonaffine-loops -polly-allow-nonaffine -debug-only=polly-dependence < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK: MayWriteAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/DependenceInfo/reduction_complex_location.ll b/polly/test/DependenceInfo/reduction_complex_location.ll
index 7ca839996326..7722ee974c3f 100644
--- a/polly/test/DependenceInfo/reduction_complex_location.ll
+++ b/polly/test/DependenceInfo/reduction_complex_location.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-dependences -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-dependences -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
index 3632bd202da2..840d1f32dca3 100644
--- a/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_dependences_equal_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; This loopnest contains a reduction which imposes the same dependences as the
; accesses to the array A. We need to ensure we keep the dependences of A.
diff --git a/polly/test/DependenceInfo/reduction_dependences_not_null.ll b/polly/test/DependenceInfo/reduction_dependences_not_null.ll
index 69fd74478ecc..56d84a9aec6d 100644
--- a/polly/test/DependenceInfo/reduction_dependences_not_null.ll
+++ b/polly/test/DependenceInfo/reduction_dependences_not_null.ll
@@ -1,7 +1,7 @@
; Test that the reduction dependences are always initialised, even in a case
; where we have no reduction. If this object is NULL, then isl operations on
; it will fail.
-; RUN: opt -S %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s -check-prefix=VALUE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; for(i = 0; i < 100; i++ )
diff --git a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
index 71903d9e7111..76c7fc64ae89 100644
--- a/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_mixed_reduction_and_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0 + i1, o1] : i0 >= 0 and 0 <= i1 <= 1023 - i0 and i1 <= 1 and 0 < o1 <= 511 }
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
index 234de5c367a0..02b814a0d7c0 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum.ll
@@ -1,6 +1,6 @@
-; RUN: opt -basic-aa %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
-; RUN: opt -basic-aa %loadPolly -polly-print-dependences -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
-; RUN: opt -basic-aa %loadPolly -polly-print-dependences -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=reference-wise -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-dependences>' -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s
;
; Verify that only the inner reduction like accesses cause reduction dependences
;
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
index acd674dc0117..91bd35deebd0 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
index bdfcfc99c8cb..040d51378239 100644
--- a/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_loops_array_sum_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
;
; CHECK: Reduction dependences:
; CHECK-NEXT: { Stmt_for_inc[i0, i1] -> Stmt_for_inc[i0, 1 + i1] : 0 <= i0 <= 99 and 0 <= i1 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions.ll b/polly/test/DependenceInfo/reduction_multiple_reductions.ll
index cf705080e03d..527a8cfc3556 100644
--- a/polly/test/DependenceInfo/reduction_multiple_reductions.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_reductions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; Verify we do not have dependences between the if and the else clause
;
diff --git a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
index 8d8557a129ab..fb5fd96a2e42 100644
--- a/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
+++ b/polly/test/DependenceInfo/reduction_multiple_reductions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
;
; These are the important RAW dependences, as they need to originate/end in only one iteration:
diff --git a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
index 7b4a68a2a897..3ec3920268b4 100644
--- a/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/DependenceInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; FIXME: Change the comment once we allow different pointers
; The statement is "almost" reduction like but should not yield any reduction dependences
diff --git a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
index 0d09e5a861a0..23bd8ef25bd7 100644
--- a/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
+++ b/polly/test/DependenceInfo/reduction_partially_escaping_intermediate_in_other_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -basic-aa -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s
;
; CHECK: Reduction dependences:
; CHECK-NEXT: [N] -> { Stmt_for_body3[i0, i1] -> Stmt_for_body3[i0, 1 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and 1024 - N + i0 <= i1 <= 1022 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps.ll b/polly/test/DependenceInfo/reduction_privatization_deps.ll
index ce90e21a898d..0e0f71737ffd 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[-1 + i0 + i1] : 0 <= i0 <= 1023 and i1 >= 0 and -i0 < i1 <= 1024 - i0 and i1 <= 1023; Stmt_S0[i0] -> Stmt_S1[o0, i0 - o0] : i0 <= 1023 and 0 <= o0 <= i0 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
index 4904004d4781..cafa319e2cc7 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; We have privatization dependences from a textually later statement to a
; textually earlier one, but the dependences still go forward in time.
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll
index a3935ebd6cc4..d86da92fbcab 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_3.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S3[2 + i0] : 0 <= i0 <= 96; Stmt_S2[i0, i1] -> Stmt_S3[o0] : i1 <= 1 - i0 and -i1 < o0 <= 1 and o0 <= 1 + i0 - i1; Stmt_S3[i0] -> Stmt_S2[o0, 1 - i0] : 0 <= i0 <= 1 and i0 < o0 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll
index 10d726af5145..d84c04fc309b 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_4.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0, i0] : 0 <= i0 <= 98; Stmt_S2[i0, i0] -> Stmt_S3[i0] : 0 <= i0 <= 98; Stmt_S3[i0] -> Stmt_S2[o0, i0] : i0 >= 0 and i0 < o0 <= 98; Stmt_S2[i0, i1] -> Stmt_S1[i1] : i0 >= 0 and i0 < i1 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll
index e8d51181725e..592c7238c3c5 100644
--- a/polly/test/DependenceInfo/reduction_privatization_deps_5.ll
+++ b/polly/test/DependenceInfo/reduction_privatization_deps_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0, 0] -> Stmt_S2[i0, 0] : 0 <= i0 <= 98; Stmt_S2[i0, 0] -> Stmt_S1[1 + i0, 0] : 0 <= i0 <= 97 }
diff --git a/polly/test/DependenceInfo/reduction_sequence.ll b/polly/test/DependenceInfo/reduction_sequence.ll
index 4a4688953938..7ce9d37d395b 100644
--- a/polly/test/DependenceInfo/reduction_sequence.ll
+++ b/polly/test/DependenceInfo/reduction_sequence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
; void manyreductions(long *A) {
; for (long i = 0; i < 1024; i++)
diff --git a/polly/test/DependenceInfo/reduction_simple_iv.ll b/polly/test/DependenceInfo/reduction_simple_iv.ll
index e3307afae08b..d13d14ecaad9 100644
--- a/polly/test/DependenceInfo/reduction_simple_iv.ll
+++ b/polly/test/DependenceInfo/reduction_simple_iv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll
index c7651c39a563..4c97fbb1aacb 100644
--- a/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll
+++ b/polly/test/DependenceInfo/reduction_simple_iv_debug_wrapped_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -debug-only=polly-dependence -disable-output < %s 2>&1 | FileCheck %s
;
; REQUIRES: asserts
;
diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll
index b61fd8453a8c..804005cf72a7 100644
--- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll
+++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { Stmt_S1[i0, i1] -> Stmt_S2[i0] : 0 <= i0 <= 99 and 0 <= i1 <= 99; Stmt_S0[i0] -> Stmt_S1[i0, o1] : 0 <= i0 <= 99 and 0 <= o1 <= 99; Stmt_S2[i0] -> Stmt_S0[1 + i0] : 0 <= i0 <= 98 }
diff --git a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll
index a3a87c70d905..9596827b4cbb 100644
--- a/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll
+++ b/polly/test/DependenceInfo/reduction_simple_privatization_deps_w_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: [N] -> { Stmt_S1[i0] -> Stmt_S2[] : N >= 11 and 0 <= i0 <= 1023; Stmt_S0[] -> Stmt_S1[o0] : N >= 11 and 0 <= o0 <= 1023 }
diff --git a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll
index c90462962ce0..d67683d11a4b 100644
--- a/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll
+++ b/polly/test/DependenceInfo/reduction_two_reductions_different_rloops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-dependences -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s
;
; CHECK: RAW dependences:
; CHECK-NEXT: { }
diff --git a/polly/test/DependenceInfo/sequential_loops.ll b/polly/test/DependenceInfo/sequential_loops.ll
index 8dfa13cb9db8..6ae720030332 100644
--- a/polly/test/DependenceInfo/sequential_loops.ll
+++ b/polly/test/DependenceInfo/sequential_loops.ll
@@ -1,34 +1,43 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
-; RUN: opt -S %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
-; RUN: opt -S %loadPolly -basic-aa -polly-print-dependences -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -disable-output < %s | FileCheck %s -check-prefix=VALUE
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=memory-based -disable-output < %s | FileCheck %s -check-prefix=MEMORY
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-dependences>' -polly-dependences-analysis-type=value-based -polly-dependences-analysis-level=access-wise -disable-output < %s | FileCheck %s -check-prefix=VALUE_ACCESS
-; VALUE-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'sequential_writes':
-; VALUE-NEXT: RAW dependences:
+; VALUE: RAW dependences:
; VALUE-NEXT: { }
; VALUE-NEXT: WAR dependences:
; VALUE-NEXT: { }
; VALUE-NEXT: WAW dependences:
; VALUE-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
;
-;VALUE_ACCESS-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'sequential_writes':
-;VALUE_ACCESS-NEXT: RAW dependences:
-;VALUE_ACCESS-NEXT: { }
-;VALUE_ACCESS-NEXT: WAR dependences:
-;VALUE_ACCESS-NEXT: { }
-;VALUE_ACCESS-NEXT: WAW dependences:
-;VALUE_ACCESS-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; [Stmt_S2[i0] -> Stmt_S2_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 0 <= i0 <= 9; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S2[i0] -> Stmt_S2_Write0[]] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 10 <= i0 <= 99 }
-
-;
-; VALUE-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'read_after_writes':
-; VALUE-NEXT: RAW dependences:
+; VALUE: RAW dependences:
; VALUE-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99 }
; VALUE-NEXT: WAR dependences:
; VALUE-NEXT: { }
; VALUE-NEXT: WAW dependences:
; VALUE-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9 }
;
-;VALUE_ACCESS-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'read_after_writes':
-;VALUE_ACCESS-NEXT: RAW dependences:
+; VALUE: RAW dependences:
+; VALUE-NEXT: { }
+; VALUE-NEXT: WAR dependences:
+; VALUE-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99 }
+; VALUE-NEXT: WAW dependences:
+; VALUE-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
+;
+; VALUE: RAW dependences:
+; VALUE-NEXT: [p] -> { Stmt_S1[i0] -> Stmt_S2[-p + i0] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p }
+; VALUE-NEXT: WAR dependences:
+; VALUE-NEXT: [p] -> { }
+; VALUE-NEXT: WAW dependences:
+; VALUE-NEXT: [p] -> { }
+;
+;VALUE_ACCESS: RAW dependences:
+;VALUE_ACCESS-NEXT: { }
+;VALUE_ACCESS-NEXT: WAR dependences:
+;VALUE_ACCESS-NEXT: { }
+;VALUE_ACCESS-NEXT: WAW dependences:
+;VALUE_ACCESS-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; [Stmt_S2[i0] -> Stmt_S2_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 0 <= i0 <= 9; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S2[i0] -> Stmt_S2_Write0[]] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 10 <= i0 <= 99 }
+;
+;VALUE_ACCESS: RAW dependences:
;VALUE_ACCESS-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; [Stmt_S2[i0] -> Stmt_S2_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Read0[]] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Read0[]] : 10 <= i0 <= 99 }
;VALUE_ACCESS-NEXT: WAR dependences:
@@ -36,64 +45,42 @@
;VALUE_ACCESS-NEXT: WAW dependences:
;VALUE_ACCESS-NEXT: { [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S2[i0] -> Stmt_S2_Write0[]] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9 }
;
-; VALUE-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'write_after_read':
-; VALUE-NEXT: RAW dependences:
-; VALUE-NEXT: { }
-; VALUE-NEXT: WAR dependences:
-; VALUE-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99 }
-; VALUE-NEXT: WAW dependences:
-; VALUE-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
-;
-;VALUE_ACCESS-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'write_after_read':
-;VALUE_ACCESS-NEXT: RAW dependences:
+;VALUE_ACCESS: RAW dependences:
;VALUE_ACCESS-NEXT: { }
;VALUE_ACCESS-NEXT: WAR dependences:
;VALUE_ACCESS-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 10 <= i0 <= 99; [Stmt_S1[i0] -> Stmt_S1_Read0[]] -> [Stmt_S2[i0] -> Stmt_S2_Write0[]] : 0 <= i0 <= 9; [Stmt_S1[i0] -> Stmt_S1_Read0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 10 <= i0 <= 99 }
;VALUE_ACCESS-NEXT: WAW dependences:
;VALUE_ACCESS-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; [Stmt_S2[i0] -> Stmt_S2_Write0[]] -> [Stmt_S3[i0] -> Stmt_S3_Write0[]] : 0 <= i0 <= 9 }
;
-; VALUE-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.2' in function 'parametric_offset':
-; VALUE-NEXT: RAW dependences:
-; VALUE-NEXT: [p] -> { Stmt_S1[i0] -> Stmt_S2[-p + i0] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p }
-; VALUE-NEXT: WAR dependences:
-; VALUE-NEXT: [p] -> { }
-; VALUE-NEXT: WAW dependences:
-; VALUE-NEXT: [p] -> { }
-;
-;VALUE_ACCESS-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.2' in function 'parametric_offset':
-;VALUE_ACCESS-NEXT: RAW dependences:
+;VALUE_ACCESS: RAW dependences:
;VALUE_ACCESS-NEXT: [p] -> { Stmt_S1[i0] -> Stmt_S2[-p + i0] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p; [Stmt_S1[i0] -> Stmt_S1_Write0[]] -> [Stmt_S2[-p + i0] -> Stmt_S2_Read0[]] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p }
;VALUE_ACCESS-NEXT: WAR dependences:
;VALUE_ACCESS-NEXT: [p] -> { }
;VALUE_ACCESS-NEXT: WAW dependences:
;VALUE_ACCESS-NEXT: [p] -> { }
-; MEMORY-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'sequential_writes':
-; MEMORY-NEXT: RAW dependences:
+; MEMORY: RAW dependences:
; MEMORY-NEXT: { }
; MEMORY-NEXT: WAR dependences:
; MEMORY-NEXT: { }
; MEMORY-NEXT: WAW dependences:
; MEMORY-NEXT: { Stmt_S1[i0] -> Stmt_S3[i0] : 0 <= i0 <= 99; Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
;
-; MEMORY-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'read_after_writes':
-; MEMORY-NEXT: RAW dependences:
+; MEMORY: RAW dependences:
; MEMORY-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 0 <= i0 <= 99 }
; MEMORY-NEXT: WAR dependences:
; MEMORY-NEXT: { }
; MEMORY-NEXT: WAW dependences:
; MEMORY-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9 }
;
-; MEMORY-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.3' in function 'write_after_read':
-; MEMORY-NEXT: RAW dependences:
+; MEMORY: RAW dependences:
; MEMORY-NEXT: { }
; MEMORY-NEXT: WAR dependences:
; MEMORY-NEXT: { Stmt_S1[i0] -> Stmt_S2[i0] : 0 <= i0 <= 9; Stmt_S1[i0] -> Stmt_S3[i0] : 0 <= i0 <= 99 }
; MEMORY-NEXT: WAW dependences:
; MEMORY-NEXT: { Stmt_S2[i0] -> Stmt_S3[i0] : 0 <= i0 <= 9 }
;
-; MEMORY-LABEL: Printing analysis 'Polly - Calculate dependences' for region: 'S1 => exit.2' in function 'parametric_offset':
-; MEMORY-NEXT: RAW dependences:
+; MEMORY: RAW dependences:
; MEMORY-NEXT: [p] -> { Stmt_S1[i0] -> Stmt_S2[-p + i0] : i0 >= p and 0 <= i0 <= 99 and i0 <= 9 + p }
; MEMORY-NEXT: WAR dependences:
; MEMORY-NEXT: [p] -> { }
diff --git a/polly/test/ForwardOpTree/atax.ll b/polly/test/ForwardOpTree/atax.ll
index 0690c1b000fa..496e8315b068 100644
--- a/polly/test/ForwardOpTree/atax.ll
+++ b/polly/test/ForwardOpTree/atax.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ForwardOpTree/changed-kind.ll b/polly/test/ForwardOpTree/changed-kind.ll
index a1d59825b3b2..b9081f373404 100644
--- a/polly/test/ForwardOpTree/changed-kind.ll
+++ b/polly/test/ForwardOpTree/changed-kind.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
; In the code below, %0 is known to be equal to the content of @c (constant 0).
; Thus, in order to save a scalar dependency, forward-optree replaces
diff --git a/polly/test/ForwardOpTree/forward_from_region.ll b/polly/test/ForwardOpTree/forward_from_region.ll
index 53d22800081e..767a580dccf9 100644
--- a/polly/test/ForwardOpTree/forward_from_region.ll
+++ b/polly/test/ForwardOpTree/forward_from_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move instructions from region statements.
;
diff --git a/polly/test/ForwardOpTree/forward_hoisted.ll b/polly/test/ForwardOpTree/forward_hoisted.ll
index 32fca00141dd..5d0b0a884b76 100644
--- a/polly/test/ForwardOpTree/forward_hoisted.ll
+++ b/polly/test/ForwardOpTree/forward_hoisted.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify).
; This involves making the load-hoisted %val1 available in %bodyB.
diff --git a/polly/test/ForwardOpTree/forward_instruction.ll b/polly/test/ForwardOpTree/forward_instruction.ll
index 1dcd64357324..50a9b07b8a05 100644
--- a/polly/test/ForwardOpTree/forward_instruction.ll
+++ b/polly/test/ForwardOpTree/forward_instruction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify)
;
diff --git a/polly/test/ForwardOpTree/forward_into_region.ll b/polly/test/ForwardOpTree/forward_into_region.ll
index dd18cfe5e61a..ef71b11dc571 100644
--- a/polly/test/ForwardOpTree/forward_into_region.ll
+++ b/polly/test/ForwardOpTree/forward_into_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move instructions to region statements.
;
diff --git a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll
index e5458c027880..1c585446ae63 100644
--- a/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll
+++ b/polly/test/ForwardOpTree/forward_into_region_redundant_use.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
define void @foo(ptr %A, i32 %p, ptr %B) {
diff --git a/polly/test/ForwardOpTree/forward_load.ll b/polly/test/ForwardOpTree/forward_load.ll
index 86e3cb0203fa..0bba41833fb1 100644
--- a/polly/test/ForwardOpTree/forward_load.ll
+++ b/polly/test/ForwardOpTree/forward_load.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-optree>)" -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load.
diff --git a/polly/test/ForwardOpTree/forward_load_differentarray.ll b/polly/test/ForwardOpTree/forward_load_differentarray.ll
index 786277bdeb87..364bf3ef3713 100644
--- a/polly/test/ForwardOpTree/forward_load_differentarray.ll
+++ b/polly/test/ForwardOpTree/forward_load_differentarray.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; To forward %val, B[j] cannot be reused in bodyC because it is overwritten
; in between. Verify that instead the alternative C[j] is used.
diff --git a/polly/test/ForwardOpTree/forward_load_double_write.ll b/polly/test/ForwardOpTree/forward_load_double_write.ll
index 1618722381fc..4c30c7f8da56 100644
--- a/polly/test/ForwardOpTree/forward_load_double_write.ll
+++ b/polly/test/ForwardOpTree/forward_load_double_write.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load even when two writes of identical values are in
; one scop statement.
diff --git a/polly/test/ForwardOpTree/forward_load_fromloop.ll b/polly/test/ForwardOpTree/forward_load_fromloop.ll
index 8f08a1356c38..1494e872a894 100644
--- a/polly/test/ForwardOpTree/forward_load_fromloop.ll
+++ b/polly/test/ForwardOpTree/forward_load_fromloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Forward the LoadInst %val into %bodyB. %val is executed multiple times;
; we must get the last loaded value.
diff --git a/polly/test/ForwardOpTree/forward_load_indirect.ll b/polly/test/ForwardOpTree/forward_load_indirect.ll
index f83af61e6741..51ce94d26727 100644
--- a/polly/test/ForwardOpTree/forward_load_indirect.ll
+++ b/polly/test/ForwardOpTree/forward_load_indirect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Forward an operand tree consisting of a speculatable instruction (%add)
; and a load (%val).
diff --git a/polly/test/ForwardOpTree/forward_load_memset_after.ll b/polly/test/ForwardOpTree/forward_load_memset_after.ll
index 13797a44c862..bd2cad411ecc 100644
--- a/polly/test/ForwardOpTree/forward_load_memset_after.ll
+++ b/polly/test/ForwardOpTree/forward_load_memset_after.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load in the presence of a non-store WRITE access.
;
diff --git a/polly/test/ForwardOpTree/forward_load_memset_before.ll b/polly/test/ForwardOpTree/forward_load_memset_before.ll
index 60b1e076b980..3e89dea37775 100644
--- a/polly/test/ForwardOpTree/forward_load_memset_before.ll
+++ b/polly/test/ForwardOpTree/forward_load_memset_before.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load in the presence of a non-store WRITE access.
;
diff --git a/polly/test/ForwardOpTree/forward_load_tripleuse.ll b/polly/test/ForwardOpTree/forward_load_tripleuse.ll
index 1d0df2a22e87..7526a8313945 100644
--- a/polly/test/ForwardOpTree/forward_load_tripleuse.ll
+++ b/polly/test/ForwardOpTree/forward_load_tripleuse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-optree -polly-codegen -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-optree>,polly-codegen' -disable-output < %s | FileCheck %s -match-full-lines
;
; %val1 is used three times: Twice by its own operand tree of %val2 and once
; more by the store in %bodyB.
diff --git a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll
index b7bae5628986..daf289d8b0da 100644
--- a/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll
+++ b/polly/test/ForwardOpTree/forward_load_unrelatedunusual.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load.
; The non-analyzable store to C[0] is unrelated and can be ignored.
diff --git a/polly/test/ForwardOpTree/forward_phi_load.ll b/polly/test/ForwardOpTree/forward_phi_load.ll
index 0b0bb209a3ef..1457aa96e2de 100644
--- a/polly/test/ForwardOpTree/forward_phi_load.ll
+++ b/polly/test/ForwardOpTree/forward_phi_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-optree-normalize-phi=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load.
;
diff --git a/polly/test/ForwardOpTree/forward_readonly.ll b/polly/test/ForwardOpTree/forward_readonly.ll
index a29c5bff5d70..646121c4efef 100644
--- a/polly/test/ForwardOpTree/forward_readonly.ll
+++ b/polly/test/ForwardOpTree/forward_readonly.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-analyze-read-only-scalars=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL
-; RUN: opt %loadPolly -polly-analyze-read-only-scalars=false -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,MODEL
+; RUN: opt %loadNPMPolly -polly-analyze-read-only-scalars=false '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines -check-prefixes=STATS,NOMODEL
;
; Move %val to %bodyB, so %bodyA can be removed (by -polly-simplify)
;
diff --git a/polly/test/ForwardOpTree/forward_reusue.ll b/polly/test/ForwardOpTree/forward_reusue.ll
index ead8c7379803..d8ad31782ecb 100644
--- a/polly/test/ForwardOpTree/forward_reusue.ll
+++ b/polly/test/ForwardOpTree/forward_reusue.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move operand tree without duplicating values used multiple times.
;
diff --git a/polly/test/ForwardOpTree/forward_store.ll b/polly/test/ForwardOpTree/forward_store.ll
index a6369eb303c1..17cb8b395eb3 100644
--- a/polly/test/ForwardOpTree/forward_store.ll
+++ b/polly/test/ForwardOpTree/forward_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Rematerialize a load.
;
diff --git a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll
index f0da9320c43f..57b68180bb12 100644
--- a/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll
+++ b/polly/test/ForwardOpTree/forward_synthesizable_definloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Copy %val to bodyB, assuming the exit value of %i.
;
diff --git a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll
index a38ab543e255..b4828e4c2c42 100644
--- a/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll
+++ b/polly/test/ForwardOpTree/forward_synthesizable_indvar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Test support for (synthesizable) induction variables.
;
diff --git a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll
index bb1760ae0ffb..3228bb60d2ca 100644
--- a/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll
+++ b/polly/test/ForwardOpTree/forward_synthesizable_useinloop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Synthesizable values defined outside of a loop can be used
; inside the loop.
diff --git a/polly/test/ForwardOpTree/forward_transitive.ll b/polly/test/ForwardOpTree/forward_transitive.ll
index 243889437149..aacf1358648f 100644
--- a/polly/test/ForwardOpTree/forward_transitive.ll
+++ b/polly/test/ForwardOpTree/forward_transitive.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Move %v and %val to %bodyB, so %bodyA can be removed (by -polly-simplify)
;
diff --git a/polly/test/ForwardOpTree/jacobi-1d.ll b/polly/test/ForwardOpTree/jacobi-1d.ll
index 05ccd998c1a2..c9c71a15a426 100644
--- a/polly/test/ForwardOpTree/jacobi-1d.ll
+++ b/polly/test/ForwardOpTree/jacobi-1d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ForwardOpTree/noforward_from_region.ll b/polly/test/ForwardOpTree/noforward_from_region.ll
index 30150912f32e..bd5864c25f54 100644
--- a/polly/test/ForwardOpTree/noforward_from_region.ll
+++ b/polly/test/ForwardOpTree/noforward_from_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Ensure we do not move instructions from region statements in case the
; instruction to move loads from an array which is also written to from
diff --git a/polly/test/ForwardOpTree/noforward_load_conditional.ll b/polly/test/ForwardOpTree/noforward_load_conditional.ll
index eaa0fc52186b..5474e740de80 100644
--- a/polly/test/ForwardOpTree/noforward_load_conditional.ll
+++ b/polly/test/ForwardOpTree/noforward_load_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; B[j] is overwritten by at least one statement between the
; definition of %val and its use. Hence, it cannot be forwarded.
diff --git a/polly/test/ForwardOpTree/noforward_load_writebetween.ll b/polly/test/ForwardOpTree/noforward_load_writebetween.ll
index e2272c1c1f13..697c940be4fd 100644
--- a/polly/test/ForwardOpTree/noforward_load_writebetween.ll
+++ b/polly/test/ForwardOpTree/noforward_load_writebetween.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Cannot rematerialize %val from B[0] at bodyC because B[0] has been
; overwritten in bodyB.
diff --git a/polly/test/ForwardOpTree/noforward_outofquota.ll b/polly/test/ForwardOpTree/noforward_outofquota.ll
index 2ec965d71184..306bb8d7558d 100644
--- a/polly/test/ForwardOpTree/noforward_outofquota.ll
+++ b/polly/test/ForwardOpTree/noforward_outofquota.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-optree-max-ops=1 -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
-; RUN: opt %loadPolly -polly-optree-max-ops=1 -polly-optree -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS
+; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-optree-max-ops=1 -passes=polly-optree -disable-output -stats < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=STATS
; REQUIRES: asserts
;
; for (int j = 0; j < n; j += 1) {
diff --git a/polly/test/ForwardOpTree/noforward_partial.ll b/polly/test/ForwardOpTree/noforward_partial.ll
index 127ac9ff5f14..edb5d34801cc 100644
--- a/polly/test/ForwardOpTree/noforward_partial.ll
+++ b/polly/test/ForwardOpTree/noforward_partial.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Not the entire operand tree can be forwarded,
; some scalar dependencies would remain.
diff --git a/polly/test/ForwardOpTree/noforward_phi.ll b/polly/test/ForwardOpTree/noforward_phi.ll
index 58d41a410d3b..755abad4336e 100644
--- a/polly/test/ForwardOpTree/noforward_phi.ll
+++ b/polly/test/ForwardOpTree/noforward_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Do not move PHI nodes.
;
diff --git a/polly/test/ForwardOpTree/noforward_selfrefphi.ll b/polly/test/ForwardOpTree/noforward_selfrefphi.ll
index b2d4dc51c978..be7e82f72633 100644
--- a/polly/test/ForwardOpTree/noforward_selfrefphi.ll
+++ b/polly/test/ForwardOpTree/noforward_selfrefphi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-optree-normalize-phi=true -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-optree-normalize-phi=true '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Contains a self-referencing PHINode that would require a
; transitive closure to handle.
diff --git a/polly/test/ForwardOpTree/noforward_sideffects.ll b/polly/test/ForwardOpTree/noforward_sideffects.ll
index a5633769f670..c01b72a1c142 100644
--- a/polly/test/ForwardOpTree/noforward_sideffects.ll
+++ b/polly/test/ForwardOpTree/noforward_sideffects.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Do not forward instructions with side-effects (here: function call).
;
diff --git a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll
index f589fde6e415..776d848072a2 100644
--- a/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll
+++ b/polly/test/ForwardOpTree/noforward_synthesizable_unknownit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Do not try to forward %i.trunc; it is not synthesizable in %body.
;
diff --git a/polly/test/ForwardOpTree/out-of-quota1.ll b/polly/test/ForwardOpTree/out-of-quota1.ll
index 7afdb8e60244..ee3e32698dd0 100644
--- a/polly/test/ForwardOpTree/out-of-quota1.ll
+++ b/polly/test/ForwardOpTree/out-of-quota1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-optree -disable-output %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-optree>' -disable-output %s | FileCheck %s
; This used to loop infinitely because of UINT_MAX returned by ISL on out-of-quota.
diff --git a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll
index 9b95cd5b4bbd..81c29536010b 100644
--- a/polly/test/IstAstInfo/alias_checks_with_empty_context.ll
+++ b/polly/test/IstAstInfo/alias_checks_with_empty_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \
; RUN: | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/IstAstInfo/alias_simple_1.ll b/polly/test/IstAstInfo/alias_simple_1.ll
index 83d470c2d19b..904f55dc32ce 100644
--- a/polly/test/IstAstInfo/alias_simple_1.ll
+++ b/polly/test/IstAstInfo/alias_simple_1.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=BASI
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
;
; int A[1024];
;
diff --git a/polly/test/IstAstInfo/alias_simple_2.ll b/polly/test/IstAstInfo/alias_simple_2.ll
index bbf528f93b47..5fae579995b2 100644
--- a/polly/test/IstAstInfo/alias_simple_2.ll
+++ b/polly/test/IstAstInfo/alias_simple_2.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=BASI
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
;
; int A[1024], B[1024];
;
diff --git a/polly/test/IstAstInfo/alias_simple_3.ll b/polly/test/IstAstInfo/alias_simple_3.ll
index 9067521323ab..8599c2993474 100644
--- a/polly/test/IstAstInfo/alias_simple_3.ll
+++ b/polly/test/IstAstInfo/alias_simple_3.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=BASI
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
-; RUN: opt %loadPolly -polly-print-ast -disable-basic-aa -globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=basic-aa -disable-output < %s | FileCheck %s --check-prefix=BASI
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=scev-aa -disable-output < %s | FileCheck %s --check-prefix=SCEV
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=globals-aa -disable-output < %s | FileCheck %s --check-prefix=GLOB
;
; int A[1024];
; float B[1024];
diff --git a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll
index 0cabd20168ba..dc21dc1f96a4 100644
--- a/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll
+++ b/polly/test/IstAstInfo/aliasing_arrays_with_identical_base.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll
index b824c211fd31..8d4adfa405f0 100644
--- a/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll
+++ b/polly/test/IstAstInfo/aliasing_multiple_alias_groups.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-ast -tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline= -disable-output < %s | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -aa-pipeline=tbaa -disable-output < %s | FileCheck %s --check-prefix=TBAA
;
; void jd(int *Int0, int *Int1, float *Float0, float *Float1) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll
index e0c3255dd766..be37b27b6e37 100644
--- a/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll
+++ b/polly/test/IstAstInfo/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output %s | FileCheck %s
;
; void jd(int *A, int *B, int c) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll
index 74bad6c75784..15550583340d 100644
--- a/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll
+++ b/polly/test/IstAstInfo/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
;
; void jd(int *A, int *B, int c) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/IstAstInfo/dependence_distance_minimal.ll b/polly/test/IstAstInfo/dependence_distance_minimal.ll
index c6b1d156e55d..d69cc3f9fc3f 100644
--- a/polly/test/IstAstInfo/dependence_distance_minimal.ll
+++ b/polly/test/IstAstInfo/dependence_distance_minimal.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; The minimal dependence distance of the innermost loop should be 1 instead of 250.
; CHECK: #pragma minimal dependence distance: 1
diff --git a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
index 32cebd7a3a8b..e2cf0bd9c0df 100644
--- a/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
+++ b/polly/test/IstAstInfo/domain_bounded_only_with_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; CHECK: {
; CHECK-NEXT: if (p <= -1 || p >= 1)
diff --git a/polly/test/IstAstInfo/non_affine_access.ll b/polly/test/IstAstInfo/non_affine_access.ll
index d8757b2e21cf..98e8d2db959f 100644
--- a/polly/test/IstAstInfo/non_affine_access.ll
+++ b/polly/test/IstAstInfo/non_affine_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-print-accesses -polly-allow-nonaffine -disable-output < %s | FileCheck %s
;
; void non_affine_access(float A[]) {
; for (long i = 0; i < 1024; i++)
diff --git a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
index 8d52e345a76d..c20a7d6db13c 100644
--- a/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
+++ b/polly/test/IstAstInfo/reduction_clauses_onedimensional_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma known-parallel reduction (^ : MemRef_sum)
; void f(int N, int M, int *sum) {
diff --git a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
index 9c6eea6aaa1e..e6092f0b068f 100644
--- a/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
+++ b/polly/test/IstAstInfo/reduction_dependences_equal_non_reduction_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; This loopnest contains a reduction which imposes the same dependences as the
; accesses to the array A. We need to ensure we do __not__ parallelize anything
diff --git a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
index 5104f716d810..14de70f9357c 100644
--- a/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
+++ b/polly/test/IstAstInfo/reduction_different_reduction_clauses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma simd reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and)
; CHECK: #pragma known-parallel reduction (+ : MemRef_sum{{[1,2]}}, MemRef_sum{{[1,2]}}) reduction (* : MemRef_prod) reduction (| : MemRef_or) reduction (& : MemRef_and)
diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
index 8a42cf8bd165..15fca884c2b6 100644
--- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma known-parallel reduction (+ : MemRef_A)
; CHECK-NEXT: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
index 8f5efd165546..44e9aa4d1e56 100644
--- a/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_and_loop_reversal_schedule_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma known-parallel reduction
; CHECK: for (int c0 = 0; c0 <= 2; c0 += 1) {
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
index a711a36a367f..266753555cab 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK: #pragma known-parallel
; CHECK: for (int c0 = 0; c0 <= 1; c0 += 1)
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
index 485d6965b6d3..46b2559c6e0b 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; Verify that the outer dimension doesn't carry reduction dependences
;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
index 375fabbf6a8b..6f40ee90fef5 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; Verify that the outer dimension doesn't carry reduction dependences
;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
index 584c076dcff4..f82b9569a88b 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; Verify that the outer dimension doesn't carry reduction dependences
;
diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
index eaa3444a04d7..b889db4819cd 100644
--- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
+++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; Verify that only the outer dimension needs privatization
;
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
index 9618ec872c38..2a8fd7a4f670 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
index af317570eb37..25f2fa597e34 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
index 1f7191433bf8..0d6be9a9da9b 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
index 40bae5e9ac6c..8b537513cc8d 100644
--- a/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
+++ b/polly/test/IstAstInfo/reduction_multiple_dimensions_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s
;
; CHECK-NOT:#pragma known-parallel reduction
; CHECK: #pragma known-parallel
diff --git a/polly/test/IstAstInfo/run-time-condition.ll b/polly/test/IstAstInfo/run-time-condition.ll
index ccc9c7cfd321..44d3534f651c 100644
--- a/polly/test/IstAstInfo/run-time-condition.ll
+++ b/polly/test/IstAstInfo/run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; for (i = 0; i < 1024; i++)
; A[i] = B[i];
diff --git a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
index 2853e0acf9b8..8c3f230cb413 100644
--- a/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
+++ b/polly/test/IstAstInfo/runtime_context_with_error_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
;
; Verify we do not simplify the runtime check to "true" due to the domain
; constraints as the test contains an error block that influenced the domains
diff --git a/polly/test/IstAstInfo/simple-run-time-condition.ll b/polly/test/IstAstInfo/simple-run-time-condition.ll
index 5fb99f0676b7..488cd180b899 100644
--- a/polly/test/IstAstInfo/simple-run-time-condition.ll
+++ b/polly/test/IstAstInfo/simple-run-time-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-precise-inbounds -polly-precise-fold-accesses -disable-output < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/polly/test/IstAstInfo/single_loop_strip_mine.ll b/polly/test/IstAstInfo/single_loop_strip_mine.ll
index 1c627f817b0b..afe6179188c0 100644
--- a/polly/test/IstAstInfo/single_loop_strip_mine.ll
+++ b/polly/test/IstAstInfo/single_loop_strip_mine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-import-jscop -polly-ast-print-accesses -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-ast-print-accesses -polly-ast-detect-parallel '-passes=polly-import-jscop,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=CHECK-VECTOR
; for (i = 0; i < 1024; i++)
; A[i] = B[i];
diff --git a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
index f1cd5dae11ce..f614f90fc3fc 100644
--- a/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
+++ b/polly/test/IstAstInfo/single_loop_uint_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;#include "limits.h"
diff --git a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
index d421e221240a..e91ea1327869 100644
--- a/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
+++ b/polly/test/IstAstInfo/single_loop_ull_max_iterations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s | FileCheck %s
; XFAIL: *
;#include "limits.h"
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
index d4a1a6222518..49a962592bb9 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Bad-relation.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: expecting other token
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
index 43f9d3eda049..749b962b260f 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-No-accesses-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: Statement from JScop file has no key name 'accesses' for index 1.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
index 24ad03741216..1d97e3ebca62 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-MemAcc.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: The number of memory accesses in the JSop file and the number of memory accesses differ for index 0.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
index 1060926e7fac..f4b739398f9f 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Not-enough-statements.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: The number of indices and the number of statements differ.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
index 07975976c38b..1f5cda3518a2 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Relation-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: Memory access number 0 has no key name 'relation' for statement number 1.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
index 9f7259633811..0c750849b51e 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Statements-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file has no key name 'statements'.
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
index df7eb42da85f..d8c9c3f4ab2e 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Undeclared-ScopArrayInfo.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file contains access function with undeclared ScopArrayInfo
;
diff --git a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
index 61c1173db2e7..f8d7cb8c1453 100644
--- a/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
+++ b/polly/test/JSONExporter/ImportAccesses/ImportAccesses-Wrong-number-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file changes the number of parameter dimensions.
;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
index a14ae5c4d1bc..6e13a5e413d7 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Mispelled-type.ll
@@ -1,4 +1,4 @@
- ; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+ ; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Array has not a valid type.
;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
index 2a03197f1c1b..7f6578776e0b 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-Negative-size.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; #define Ni 1056
; #define Nj 1056
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
index 45bb3495de08..e698bdc488c2 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-name.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Array has no key 'name'.
;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
index 5bbb974346ba..f130b6556e3e 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-sizes-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Array has no key 'sizes'.
;
diff --git a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
index af013992fca0..68d2e50c6730 100644
--- a/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
+++ b/polly/test/JSONExporter/ImportArrays/ImportArrays-No-type-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Array has no key 'type'.
;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
index 2490e44ec347..94c77dc2a013 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Context-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file has no key named 'context'.
;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
index 66ce6a6ed922..c20d5c02d662 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Not-parameter-set.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: The isl_set is not a parameter set.
;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
index 7bcc54dde52e..92f4d61212e9 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Unvalid-Context.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: unexpected isl_token
;
diff --git a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
index 65cdcbdcdef6..89668d8d573b 100644
--- a/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
+++ b/polly/test/JSONExporter/ImportContext/ImportContext-Wrong-dimension.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: Imported context has the wrong number of parameters : Found 2 Expected 1
;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
index b52db0876cc5..efe15c14ce90 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-No-schedule-key.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: Statement 0 has no 'schedule' key.
;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
index 5ce3ad267bb0..db516f6d7d33 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Schedule-not-valid.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: expecting other token
;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
index 4329653899b2..b93c984d7d9d 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Statements-mispelled.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: JScop file has no key name 'statements'.
;
diff --git a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
index f66fc6c1e5d7..3fa14c64cd63 100644
--- a/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
+++ b/polly/test/JSONExporter/ImportSchedule/ImportSchedule-Wrong-number-statements.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash opt %loadPolly -polly-import-jscop -polly-ast -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
+; RUN: not --crash opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-ast>' -polly-ast-detect-parallel -disable-output < %s 2>&1 >/dev/null | FileCheck %s
;
; CHECK: The number of indices and the number of statements differ.
;
diff --git a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
index 791210f7710d..1d81ff7ef2dc 100644
--- a/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
+++ b/polly/test/MaximalStaticExpansion/load_after_store_same_statement.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1| FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that the expansion of an array with load after store in a same statement is not done.
diff --git a/polly/test/MaximalStaticExpansion/read_from_original.ll b/polly/test/MaximalStaticExpansion/read_from_original.ll
index 59f9379516c7..57017381c661 100644
--- a/polly/test/MaximalStaticExpansion/read_from_original.ll
+++ b/polly/test/MaximalStaticExpansion/read_from_original.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1| FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that Polly detects problems and does not expand the array
diff --git a/polly/test/MaximalStaticExpansion/too_many_writes.ll b/polly/test/MaximalStaticExpansion/too_many_writes.ll
index 50a66cd11d0a..7e33de17a174 100644
--- a/polly/test/MaximalStaticExpansion/too_many_writes.ll
+++ b/polly/test/MaximalStaticExpansion/too_many_writes.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that Polly detects problems and does not expand the array
diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
index 8e2707cfee64..355fc02600d5 100644
--- a/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
+++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded for MemoryKind::Array
diff --git a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
index 2bf49b89db05..930539547cc9 100644
--- a/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
+++ b/polly/test/MaximalStaticExpansion/working_deps_between_inners_phi.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that the accesses are correctly expanded for MemoryKind::Array and MemoryKind::PHI.
diff --git a/polly/test/MaximalStaticExpansion/working_expansion.ll b/polly/test/MaximalStaticExpansion/working_expansion.ll
index bb5b2360143f..a055e50225e9 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded for MemoryKind::Array
diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
index 89ff7890fc7e..77338c9aac20 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_dependences_per_statement.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded
diff --git a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
index 7ffd39f0f534..9cfa5536072b 100644
--- a/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
+++ b/polly/test/MaximalStaticExpansion/working_expansion_multiple_instruction_per_statement.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded
diff --git a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
index 43919c61b045..63e4d4804627 100644
--- a/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_phi_expansion.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that the accesses are correctly expanded for MemoryKind::PHI
diff --git a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
index a581a389e742..87bd57abab8d 100644
--- a/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
+++ b/polly/test/MaximalStaticExpansion/working_phi_two_scalars.ll
@@ -1,6 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-mse -polly-print-scops -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-mse>)" -pass-remarks-analysis="polly-mse" -disable-output < %s 2>&1 | FileCheck %s --check-prefix=MSE
;
; Verify that the accesses are correctly expanded for MemoryKind::PHI
diff --git a/polly/test/MaximalStaticExpansion/working_value_expansion.ll b/polly/test/MaximalStaticExpansion/working_value_expansion.ll
index d54eff9e03ec..cc28a78c3867 100644
--- a/polly/test/MaximalStaticExpansion/working_value_expansion.ll
+++ b/polly/test/MaximalStaticExpansion/working_value_expansion.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-mse -polly-print-scops -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-mse>)" -disable-output < %s | FileCheck %s
;
; Verify that the accesses are correctly expanded for MemoryKind::Value
diff --git a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
index 31db5560c051..9cc2aecf002d 100644
--- a/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
+++ b/polly/test/PruneUnprofitable/prune_only_scalardeps.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false -polly-prune-unprofitable -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false "-passes=scop(polly-prune-unprofitable)" -disable-output -stats < %s 2>&1 | FileCheck -match-full-lines %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
index 5acc35343ac3..38facb1688c4 100644
--- a/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
+++ b/polly/test/ScheduleOptimizer/2012-03-16-Empty-Domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
define void @sdbout_label() nounwind {
diff --git a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
index 3f4237b330b2..835986049899 100644
--- a/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
+++ b/polly/test/ScheduleOptimizer/2013-04-11-Empty-Domain-two.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -S < %s
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -S < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Check that we handle statements with an empty iteration domain correctly.
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
index a61af2d092f3..5e4ce8225a23 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-double.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
index 185d5c5b8c25..de4c387a1d87 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-first.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
index f1eca0ede061..91bd549c3c7e 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-except-third.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
index 35903ced7741..8b69d9e12c0f 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-carried.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,OPT
define void @func(i32 %n, ptr noalias nonnull %A) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
index 1fb8c001069f..49d112474034 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner-third.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK,RAW
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefixes=CHECK
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
index 2db6833fa897..a449a2fda9ba 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-inner.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
define void @func(i32 %n, ptr noalias nonnull %A) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
index 49d008ba2cfa..798e9b9a7c14 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/fuse-simple.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
define void @func(i32 %n, ptr noalias nonnull %A) {
entry:
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
index 175b85997ec0..4d0ccc988a5c 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-simple.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
; This could theoretically be fused by adjusting the offset of the second loop by %k (instead of relying on schedule dimensions).
diff --git a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
index 48ba20347d55..bf470b91a702 100644
--- a/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
+++ b/polly/test/ScheduleOptimizer/GreedyFuse/nofuse-with-middle.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-reschedule=1 -polly-loopfusion-greedy=1 -polly-postopts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B, i32 %k) {
entry:
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
index 537721f8718a..b0f75dd50ef8 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/disable_nonforced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-opt-isl -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Check that the disable_nonforced metadata is honored; optimization
; heuristics/rescheduling must not be applied.
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
index aaf4d27f4c5e..900360d7533f 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_heuristic.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-pragma-based-opts=1 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-pragma-based-opts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=ON
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines --check-prefix=OFF
;
define void @func(i32 %n, ptr noalias nonnull %A, ptr noalias nonnull %B) {
entry:
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
index b1e94227c9a5..d45b62433dbb 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_looploc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
;
; CHECK: warning: distribute_illegal.c:2:3: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
index fc0df85b1346..d835e66693fb 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/distribute_illegal_pragmaloc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-reschedule=0 -polly-pragma-based-opts=1 -disable-output < %s 2>&1 | FileCheck %s --match-full-lines
;
; CHECK: warning: distribute_illegal.c:1:42: not applying loop fission/distribution: cannot ensure semantic equivalence due to possible dependency violations
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
index 9537f3a9b0a8..a5781a7f6036 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_disable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pragma-based-opts=1 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
;
; Override unroll metadata with llvm.loop.unroll.disable.
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
index b0310970f8d6..cccf136a1c4a 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_double.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
;
; Apply two loop transformations. First partial, then full unrolling.
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
index b9a4c845477c..4d499078a436 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_full.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
;
; Full unroll of a loop with 5 iterations.
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
index 0387aecd683b..d67472ab8693 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_heuristic.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-pragma-based-opts=1 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
-; RUN: opt %loadPolly -polly-pragma-based-opts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
;
; Unrolling with heuristic factor.
; Currently not supported and expected to be handled by LLVM's unroll pass.
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
index 81e40f0a98bb..90101b4fde39 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-pragma-based-opts=1 -polly-print-opt-isl -disable-output < %s | FileCheck %s --match-full-lines
-; RUN: opt %loadPolly -polly-pragma-based-opts=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=1 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --match-full-lines
+; RUN: opt %loadNPMPolly -polly-pragma-based-opts=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OFF --match-full-lines
;
; Partial unroll by a factor of 4.
;
diff --git a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
index 8665f68b99c1..4cfa3fb91151 100644
--- a/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
+++ b/polly/test/ScheduleOptimizer/ManualOptimization/unroll_partial_followup.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-opt-isl -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
-; RUN: opt %loadPolly -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
-; RUN: opt %loadPolly -polly-opt-isl -polly-codegen -simplifycfg -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s --check-prefix=OPT --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=AST --match-full-lines
+; RUN: opt %loadNPMPolly '-passes=scop(polly-opt-isl,polly-codegen),simplifycfg' -S < %s | FileCheck %s --check-prefix=CODEGEN
;
; Partial unroll by a factor of 4.
;
@@ -49,7 +49,7 @@ return:
; OPT-NEXT: - filter: "[n] -> { Stmt_body[i0] : (1 + i0) mod 4 = 0 }"
-; AST-LABEL: Printing analysis 'Polly - Generate an AST of the SCoP (isl)' for region: 'for => return' in function 'func':
+; AST-LABEL: :: isl ast :: func :: %for---%return
; AST: // Loop with Metadata
; AST-NEXT: for (int c0 = 0; c0 < n; c0 += 4) {
diff --git a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
index 8585634e10ff..3f6f50e34775 100644
--- a/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
+++ b/polly/test/ScheduleOptimizer/SIMDInParallelFor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-parallel -polly-vectorizer=stripmine -polly-codegen-verify -polly-opt-isl -polly-print-ast -polly-codegen -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-parallel -polly-vectorizer=stripmine -passes=polly-codegen-verify '-passes=polly-opt-isl,print<polly-ast>,polly-codegen' -disable-output < %s | FileCheck %s
;
; Check that there are no nested #pragma omp parallel for inside a
; #pragma omp parallel for loop.
diff --git a/polly/test/ScheduleOptimizer/computeout.ll b/polly/test/ScheduleOptimizer/computeout.ll
index 35e3416f91d1..a3286b481ffb 100644
--- a/polly/test/ScheduleOptimizer/computeout.ll
+++ b/polly/test/ScheduleOptimizer/computeout.ll
@@ -1,6 +1,4 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-opt-isl -polly-isl-arg=--no-schedule-serialize-sccs -polly-print-ast -disable-output < %s | FileCheck %s
; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-opt-isl -polly-isl-arg=--schedule-serialize-sccs -polly-dependences-computeout=1 -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
; RUN: opt -S %loadNPMPolly "-passes=scop(polly-opt-isl,print<polly-ast>)" -polly-isl-arg=--no-schedule-serialize-sccs -polly-dependences-computeout=1 -disable-output < %s | FileCheck %s -check-prefix=TIMEOUT
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
index 43caca5372ad..928ee858ae6d 100644
--- a/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
+++ b/polly/test/ScheduleOptimizer/ensure-correct-tile-sizes.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-remarks-minimal \
-; RUN: -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-process-unprofitable -polly-remarks-minimal \
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=1 \
; RUN: -polly-target-vector-register-bitwidth=4096 \
-; RUN: -polly-target-1st-cache-level-associativity=3 -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: -polly-target-1st-cache-level-associativity=3 -disable-output < %s | FileCheck %s
;
; /* Test that Polly does not crash due to configurations that can lead to
; incorrect tile size computations.
diff --git a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
index daa1afdd0aa8..b533cb870bdc 100644
--- a/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
+++ b/polly/test/ScheduleOptimizer/focaltech_test_detail_threshold-7bc17e.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-vectorizer=stripmine -polly-invariant-load-hoisting -polly-optimized-scops -polly-print-opt-isl -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-opt-isl>)" -polly-vectorizer=stripmine -polly-invariant-load-hoisting -disable-output < %s | FileCheck %s
;
; llvm.org/PR46578
diff --git a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
index 06e86d7da1c6..3dd579ed736f 100644
--- a/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
+++ b/polly/test/ScheduleOptimizer/full_partial_tile_separation.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
; CHECK: // 1st level tiling - Tiles
; CHECK-NEXT: #pragma known-parallel
; CHECK-NEXT: for (int c0 = 0; c0 <= floord(ni - 1, 32); c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling-2.ll b/polly/test/ScheduleOptimizer/line-tiling-2.ll
index eb374cb07cf3..3a2c566d19d3 100644
--- a/polly/test/ScheduleOptimizer/line-tiling-2.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-tile-sizes=1,64 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=1,64 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
; CHECK: for (int c0 = 0; c0 <= 1023; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 7; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/line-tiling.ll b/polly/test/ScheduleOptimizer/line-tiling.ll
index 2f14ac1d02a5..0dbdeff4742b 100644
--- a/polly/test/ScheduleOptimizer/line-tiling.ll
+++ b/polly/test/ScheduleOptimizer/line-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-tile-sizes=64,1 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=64,1 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
; CHECK: for (int c0 = 0; c0 <= 15; c0 += 1)
; CHECK: for (int c1 = 0; c1 <= 511; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
index faf51e097a70..8f270b94617f 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
diff --git a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
index 30b693a2e241..de1c815f9235 100644
--- a/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
+++ b/polly/test/ScheduleOptimizer/mat_mul_pattern_data_layout_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; /* C := alpha*A*B + beta*C */
; /* _PB_NK % Kc != 0 */
@@ -18,7 +18,7 @@
; C[i][j] += alpha * A[i][k] * B[k][j];
; }
;
-; CHECK-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'bb8 => bb32' in function 'kernel_gemm':
+; CHECK-LABEL: :: isl ast :: kernel_gemm :: %bb8---%bb32
; CHECK: {
; CHECK-NEXT: // 1st level tiling - Tiles
; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/one-dimensional-band.ll b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
index 4592907a44ad..a097d4a43cfd 100644
--- a/polly/test/ScheduleOptimizer/one-dimensional-band.ll
+++ b/polly/test/ScheduleOptimizer/one-dimensional-band.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; void jacobi1d(long T, long N, float *A, float *B) {
; long t, i, j;
diff --git a/polly/test/ScheduleOptimizer/outer_coincidence.ll b/polly/test/ScheduleOptimizer/outer_coincidence.ll
index 2ab33edda86b..7c1af80c9ffa 100644
--- a/polly/test/ScheduleOptimizer/outer_coincidence.ll
+++ b/polly/test/ScheduleOptimizer/outer_coincidence.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=OUTER
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=no '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tiling=0 -polly-parallel -polly-opt-outer-coincidence=yes '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=OUTER
; By skewing, the diagonal can be made parallel. ISL does this when the Check
; the 'outer_coincidence' option is enabled.
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
index 66011168fcc1..8228a5c08f59 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly \
+; RUN: opt %loadNPMPolly \
; RUN: -polly-pattern-matching-based-opts=true \
-; RUN: -polly-optree -polly-delicm -polly-simplify \
-; RUN: -polly-opt-isl -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
+; RUN: '-passes=polly-optree,polly-delicm,polly-simplify,polly-opt-isl' \
+; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 \
; RUN: | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
index 95da89f90755..4bda7584f596 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-delicm -polly-simplify -polly-opt-isl \
+; RUN: opt %loadNPMPolly '-passes=polly-delicm,polly-simplify,polly-opt-isl' \
; RUN: -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
index 7604257f98e0..09118e252233 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=false \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=false \
; RUN: -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -debug -polly-tc-opt -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true -polly-ast-detect-parallel -disable-output < %s | FileCheck %s --check-prefix=PARALLEL-AST
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true -stats -disable-output < %s 2>&1 | FileCheck %s --check-prefix=STATS -match-full-lines
; REQUIRES: asserts
;
; /* C := alpha*A*B + beta*C */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
index ccdb39b60d75..b771d1f87537 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_11.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl' \
; RUN: -polly-import-jscop-postfix=transformed \
; RUN: -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
@@ -8,7 +8,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -debug \
+; RUN: -debug \
; RUN: -polly-tc-opt=true -disable-output < %s 2>&1 \
; RUN: | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
index dd39fec5e21f..238f6dd798e6 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_12.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -disable-output < %s
+; RUN: -passes=polly-opt-isl -disable-output < %s
;
; Test whether isolation works as expected.
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
index e086dd36c4d9..0e4540eb7ba3 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_13.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=2 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=128 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; Test whether isolation works as expected.
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
index a4c71c2dace5..9678ad83ff04 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_14.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-opt-isl \
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,polly-opt-isl,polly-codegen' \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-import-jscop-postfix=transformed -polly-codegen -S < %s \
+; RUN: -polly-import-jscop-postfix=transformed -S < %s \
; RUN: | FileCheck %s
;
; Check that we disable the Loop Vectorizer.
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
index a8da21955b63..e74884d59c31 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_15.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -debug-only=polly-opt-isl -disable-output \
; RUN: -polly-tc-opt=true < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
index c1ad3017a0d4..9c99a090b69e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_16.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
index 002816a4ae80..8e14035ce862 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_17.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
index d5679c7ae2f7..4f562c306f96 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_18.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
index 4e1620abd252..32ded897d4ff 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_19.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
index 01e336ebc60f..f0c0177da84b 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
index 0be08d8d493c..155177bdfade 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_20.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
index 9b2df49698a1..3d21ac3859a7 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_21.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
index 3d3641df5098..00a4bf885aef 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_22.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
index 895961488014..bfe5c5249a3a 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_24.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-reschedule=0 -polly-opt-isl \
+; RUN: opt %loadNPMPolly -polly-reschedule=0 -passes=polly-opt-isl \
; RUN: -polly-pattern-matching-based-opts=true -polly-tc-opt=true \
; RUN: -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
index 8a3957909d9d..a2e1ced3e632 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_25.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -polly-tc-opt=true -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
@@ -53,4 +53,4 @@ for.body8: ; preds = %for.body8, %for.con
br i1 %exitcond.not, label %for.cond.cleanup7, label %for.body8
}
-declare double @llvm.fmuladd.f64(double, double, double)
\ No newline at end of file
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
index fab3ac5e58dc..9844d377e609 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_3.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-size=0 \
; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -13,7 +13,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=EXTRACTION-OF-MACRO-KERNEL
;
; /* C := alpha*A*B + beta*C */
; for (i = 0; i < _PB_NI; i++)
@@ -24,7 +24,7 @@
; C[i][j] += alpha * A[i][k] * B[k][j];
; }
;
-; CHECK-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'bb8 => bb32' in function 'kernel_gemm':
+; CHECK-LABEL: :: isl ast :: kernel_gemm :: %bb8---%bb32
; CHECK: {
; CHECK-NEXT: // 1st level tiling - Tiles
; CHECK-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
@@ -76,7 +76,7 @@
; CHECK-NEXT: }
; CHECK-NEXT: }
;
-; EXTRACTION-OF-MACRO-KERNEL-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'bb8 => bb32' in function 'kernel_gemm':
+; EXTRACTION-OF-MACRO-KERNEL-LABEL: :: isl ast :: kernel_gemm :: %bb8---%bb32
; EXTRACTION-OF-MACRO-KERNEL: {
; EXTRACTION-OF-MACRO-KERNEL-NEXT: // 1st level tiling - Tiles
; EXTRACTION-OF-MACRO-KERNEL-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
index dc0edc6c5a3b..250641d57bac 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_4.ll
@@ -1,12 +1,12 @@
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; RUN: -debug -polly-tc-opt=true -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly '-passes=polly-opt-isl,print<polly-ast>' -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
-; RUN: -polly-target-2nd-cache-level-size=262144 -polly-print-ast \
-; RUN: -polly-tc-opt=true -disable-output -polly-opt-isl < %s | \
+; RUN: -polly-target-2nd-cache-level-size=262144 \
+; RUN: -polly-tc-opt=true -disable-output < %s | \
; RUN: FileCheck %s --check-prefix=PATTERN-MATCHING-OPTS
; REQUIRES: asserts
;
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
index 6581566bf13f..ad2c195ba1e8 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,12 +6,12 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
-; opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; -polly-target-throughput-vector-fma=1 \
; -polly-target-latency-vector-fma=8 \
-; -polly-codegen -polly-target-1st-cache-level-associativity=8 \
+; -passes=polly-codegen -polly-target-1st-cache-level-associativity=8 \
; -polly-target-2nd-cache-level-associativity=8 \
; -polly-target-1st-cache-level-size=32768 \
; -polly-target-vector-register-bitwidth=256 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
index bcf1fc9fe813..1d3cdbdbfdd8 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_6.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,12 +6,12 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
-; opt %loadPolly -polly-opt-isl -polly-pattern-matching-based-opts=true \
+; opt %loadNPMPolly -passes=polly-opt-isl -polly-pattern-matching-based-opts=true \
; -polly-target-throughput-vector-fma=1 \
; -polly-target-latency-vector-fma=8 \
-; -polly-codegen -polly-target-1st-cache-level-associativity=8 \
+; -passes=polly-codegen -polly-target-1st-cache-level-associativity=8 \
; -polly-target-2nd-cache-level-associativity=8 \
; -polly-target-1st-cache-level-size=32768 \
; -polly-target-vector-register-bitwidth=256 \
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
index 77a3e02a0063..59eaa4a0928e 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_7.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; /* C := A * B + C */
; /* Elements of the matrices A, B, C have the float type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
index d02bc359e79d..2544d502a2dc 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_8.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,7 +6,7 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
; /* C := A * B + C */
; /* Elements of the matrices B, C have the double type. */
diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
index 144abfd7622f..85c143562f5a 100644
--- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
+++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts_9.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-pattern-matching-based-opts=true \
+; RUN: opt %loadNPMPolly -polly-pattern-matching-based-opts=true \
; RUN: -polly-target-throughput-vector-fma=1 \
; RUN: -polly-target-latency-vector-fma=8 \
; RUN: -polly-target-1st-cache-level-associativity=8 \
@@ -6,9 +6,9 @@
; RUN: -polly-target-1st-cache-level-size=32768 \
; RUN: -polly-target-vector-register-bitwidth=256 \
; RUN: -polly-target-2nd-cache-level-size=262144 \
-; RUN: -polly-opt-isl -disable-output < %s
+; RUN: -passes=polly-opt-isl -disable-output < %s
;
-; RUN: opt %loadPolly -polly-print-dependences -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
+; RUN: opt %loadNPMPolly '-passes=print<polly-dependences>' -disable-output < %s | FileCheck %s --check-prefix=DEPENDENCES
;
; /* C := A * B + C */
; /* Elements of the matrices A, B, C have the char type. */
diff --git a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
index 5b9783d20bfc..64285891a16c 100644
--- a/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
+++ b/polly/test/ScheduleOptimizer/pattern_matching_based_opts_splitmap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-import-jscop -polly-import-jscop-postfix=transformed -passes=polly-opt-isl -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
;
; void pattern_matching_based_opts_splitmap(double C[static const restrict 2][2], double A[static const restrict 2][784], double B[static const restrict 784][2]) {
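Note that this conversion, like the schedule_computeout.ll one further below, spells -passes= more than once on a single opt invocation. Most hunks in this patch fold everything into one quoted pipeline instead; a hypothetical single-pipeline spelling of this RUN line (not part of the patch, mirroring the scop(...) form used elsewhere in it) would be

    ; RUN: opt %loadNPMPolly '-passes=scop(polly-import-jscop,polly-opt-isl)' -polly-import-jscop-postfix=transformed -debug-only=polly-opt-isl -disable-output < %s 2>&1 | FileCheck %s

Whether repeated -passes= options accumulate or override each other is left to opt's option parsing, so the single-pipeline form is the easier one to reason about.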
diff --git a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
index fea2155b1e4e..a18ba1daef84 100644
--- a/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization-without-tiling.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-tiling=false -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@C = common global [1536 x [1536 x float]] zeroinitializer, align 16
diff --git a/polly/test/ScheduleOptimizer/prevectorization.ll b/polly/test/ScheduleOptimizer/prevectorization.ll
index 385ebf14712a..4db61ad032ea 100644
--- a/polly/test/ScheduleOptimizer/prevectorization.ll
+++ b/polly/test/ScheduleOptimizer/prevectorization.ll
@@ -1,5 +1,5 @@
-; RUN: opt -S %loadPolly -basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt -S %loadPolly -basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s -check-prefix=VEC16
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt -S %loadNPMPolly -aa-pipeline=basic-aa -polly-pattern-matching-based-opts=false -polly-vectorizer=stripmine -polly-prevect-width=16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s -check-prefix=VEC16
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScheduleOptimizer/rectangular-tiling.ll b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
index b527255ab5f7..e1d768b351d7 100644
--- a/polly/test/ScheduleOptimizer/rectangular-tiling.ll
+++ b/polly/test/ScheduleOptimizer/rectangular-tiling.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-tile-sizes=256,16 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-tile-sizes=256,16 -polly-tiling=false -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=NOTILING
-; RUN: opt %loadPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
-; RUN: opt %loadPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-tiling=false '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=NOTILING
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWOLEVEL
+; RUN: opt %loadNPMPolly -polly-tile-sizes=256,16 -polly-2nd-level-tiling -polly-2nd-level-tile-sizes=16,8 -polly-register-tiling '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s --check-prefix=TWO-PLUS-REGISTER
; CHECK: // 1st level tiling - Tiles
; CHECK: for (int c0 = 0; c0 <= 3; c0 += 1)
diff --git a/polly/test/ScheduleOptimizer/schedule_computeout.ll b/polly/test/ScheduleOptimizer/schedule_computeout.ll
index acc8601a31a8..1e1359e3ecc6 100644
--- a/polly/test/ScheduleOptimizer/schedule_computeout.ll
+++ b/polly/test/ScheduleOptimizer/schedule_computeout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -S -polly-optree -polly-delicm -polly-opt-isl -polly-schedule-computeout=10000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-optree -passes=polly-delicm -passes=polly-opt-isl -polly-schedule-computeout=10000 -debug-only="polly-opt-isl" < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; Bailout if the computations of schedule compute exceeds the max scheduling quota.
diff --git a/polly/test/ScheduleOptimizer/statistics.ll b/polly/test/ScheduleOptimizer/statistics.ll
index 472febea173f..84eb59341d27 100644
--- a/polly/test/ScheduleOptimizer/statistics.ll
+++ b/polly/test/ScheduleOptimizer/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-opt-isl -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -passes=polly-opt-isl -stats -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
; REQUIRES: asserts
diff --git a/polly/test/ScheduleOptimizer/tile_after_fusion.ll b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
index 8e5849234af6..50a46d66176e 100644
--- a/polly/test/ScheduleOptimizer/tile_after_fusion.ll
+++ b/polly/test/ScheduleOptimizer/tile_after_fusion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-isl-arg=--no-schedule-serialize-sccs -polly-opt-isl -polly-print-ast -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-isl-arg=--no-schedule-serialize-sccs '-passes=polly-opt-isl,print<polly-ast>' -disable-output < %s | FileCheck %s
;
;
; void tf(int C[256][256][256], int A0[256][256][256], int A1[256][256][256]) {
@@ -17,7 +17,7 @@
; checks whether they are tiled after being fused when polly-opt-fusion equals
; "max".
;
-; CHECK-LABEL: Printing analysis 'Polly - Generate an AST from the SCoP (isl)' for region: 'for.cond => for.end56' in function 'tf':
+; CHECK-LABEL: :: isl ast :: tf :: %for.cond---%for.end56
; CHECK: 1st level tiling - Tiles
; CHECK-NEXT: for (int c0 = 0; c0 <= 7; c0 += 1)
; CHECK-NEXT: for (int c1 = 0; c1 <= 7; c1 += 1)
diff --git a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
index d08595db8fce..e59a31665d77 100644
--- a/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
+++ b/polly/test/ScheduleOptimizer/vivid-vbi-gen-vivid_vbi_gen_sliced-before-llvmreduced.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 -polly-print-opt-isl -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-vectorizer=stripmine -polly-isl-arg=--no-schedule-serialize-sccs -polly-tiling=0 '-passes=print<polly-opt-isl>' -disable-output < %s | FileCheck %s
; isl_schedule_node_band_sink may sink into multiple children.
; https://llvm.org/PR52637
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
index 2eddbd4cb262..cee1c06cf7aa 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
index c111f686c462..5506b3c626cf 100644
--- a/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_parametric_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/aliasing_simple_1.ll b/polly/test/ScopDetect/aliasing_simple_1.ll
index 524ca19ae398..5f43ec1856a7 100644
--- a/polly/test/ScopDetect/aliasing_simple_1.ll
+++ b/polly/test/ScopDetect/aliasing_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/aliasing_simple_2.ll b/polly/test/ScopDetect/aliasing_simple_2.ll
index 457df996c7b8..e853dfcc6448 100644
--- a/polly/test/ScopDetect/aliasing_simple_2.ll
+++ b/polly/test/ScopDetect/aliasing_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
index 0411aed6ae04..eeb9e11f812c 100644
--- a/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_load_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -polly-scops -polly-print-import-jscop -polly-codegen -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
;
; This violated an assertion in setNewAccessRelation that assumed base pointers
; to be load-hoisted. Without this assertion, it codegen would generate invalid
diff --git a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
index ff9be6ea16e8..16976e631327 100644
--- a/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
+++ b/polly/test/ScopDetect/base_pointer_setNewAccessRelation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-detect -polly-print-import-jscop -polly-codegen -disable-output < %s | FileCheck %s --allow-empty
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s --allow-empty
;
; Polly codegen used to generate invalid code (referring to %ptr from the
; original region) when regeneration of the access function is necessary.
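The last two conversions show how scop-level passes are nested under the new pass manager: a function-level printer (print<polly-function-scops> or print<polly-detect>) runs first, and a scop(...) group then applies the JSCoP import and code generation to each detected SCoP, replacing the flat -polly-print-import-jscop -polly-codegen flags of the legacy RUN lines. A minimal sketch of that shape, with the test-specific flags trimmed:

    ; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(polly-import-jscop,polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s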
diff --git a/polly/test/ScopDetect/callbr.ll b/polly/test/ScopDetect/callbr.ll
index d65ab934bf2e..418297469367 100644
--- a/polly/test/ScopDetect/callbr.ll
+++ b/polly/test/ScopDetect/callbr.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-detect -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadPolly -polly-detect -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -pass-remarks-missed=polly-detect < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-detect-track-failures -disable-output -stats < %s 2>&1 | FileCheck %s --check-prefix=STAT
; REQUIRES: asserts
; REMARK: Branch from indirect terminator.
diff --git a/polly/test/ScopDetect/collective_invariant_loads.ll b/polly/test/ScopDetect/collective_invariant_loads.ll
index f1d2eea520c6..f451bccec706 100644
--- a/polly/test/ScopDetect/collective_invariant_loads.ll
+++ b/polly/test/ScopDetect/collective_invariant_loads.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting -disable-output< %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting -disable-output< %s 2>&1 | FileCheck %s
;CHECK: Function: test_init_chpl
;CHECK-NEXT: Region: %bb1---%bb16
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit.ll b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
index ae23930b92a6..fe3922174c07 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
index 5c25da66d7ef..4cac173932a6 100644
--- a/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/cross_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
index 12983d2321cc..7d7476471bb6 100644
--- a/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
+++ b/polly/test/ScopDetect/dependency_to_phi_node_outside_of_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define void @f(ptr %A, i64 %N, i64 %M) nounwind {
diff --git a/polly/test/ScopDetect/dot-scops-npm.ll b/polly/test/ScopDetect/dot-scops-npm.ll
index 7c8be032fd4f..d14bf8a23a16 100644
--- a/polly/test/ScopDetect/dot-scops-npm.ll
+++ b/polly/test/ScopDetect/dot-scops-npm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly "-passes=polly-scop-printer" -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=polly-scop-printer' -disable-output < %s
; RUN: FileCheck %s -input-file=scops.func_npm.dot
;
; Check that the ScopPrinter does not crash.
diff --git a/polly/test/ScopDetect/dot-scops.ll b/polly/test/ScopDetect/dot-scops.ll
index c31562e4c62d..63163b23617c 100644
--- a/polly/test/ScopDetect/dot-scops.ll
+++ b/polly/test/ScopDetect/dot-scops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -dot-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,polly-scop-printer' -disable-output < %s
;
; Check that the ScopPrinter does not crash.
; ScopPrinter needs the ScopDetection pass, which should depend on
diff --git a/polly/test/ScopDetect/error-block-always-executed.ll b/polly/test/ScopDetect/error-block-always-executed.ll
index 894be2119941..d799d575a530 100644
--- a/polly/test/ScopDetect/error-block-always-executed.ll
+++ b/polly/test/ScopDetect/error-block-always-executed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/error-block-referenced-from-scop.ll b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
index 085351482139..ba271f34ea7b 100644
--- a/polly/test/ScopDetect/error-block-referenced-from-scop.ll
+++ b/polly/test/ScopDetect/error-block-referenced-from-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/error-block-unreachable.ll b/polly/test/ScopDetect/error-block-unreachable.ll
index 48f6fe8e0547..6ba7698a972b 100644
--- a/polly/test/ScopDetect/error-block-unreachable.ll
+++ b/polly/test/ScopDetect/error-block-unreachable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
; Verify that the scop detection does not crash on inputs with unreachable
; blocks. Earlier we crashed when detecting error blocks.
diff --git a/polly/test/ScopDetect/expand-region-correctly-2.ll b/polly/test/ScopDetect/expand-region-correctly-2.ll
index fadb503cff35..df35d05674f9 100644
--- a/polly/test/ScopDetect/expand-region-correctly-2.ll
+++ b/polly/test/ScopDetect/expand-region-correctly-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer
;
diff --git a/polly/test/ScopDetect/expand-region-correctly.ll b/polly/test/ScopDetect/expand-region-correctly.ll
index 72082a32fa79..a8c90c08fde0 100644
--- a/polly/test/ScopDetect/expand-region-correctly.ll
+++ b/polly/test/ScopDetect/expand-region-correctly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Valid Region for Scop: if.end.1631 => for.cond.1647.outer
diff --git a/polly/test/ScopDetect/ignore_func_flag_regex.ll b/polly/test/ScopDetect/ignore_func_flag_regex.ll
index 224126ec010e..a75e705995a7 100644
--- a/polly/test/ScopDetect/ignore_func_flag_regex.ll
+++ b/polly/test/ScopDetect/ignore_func_flag_regex.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ignore-func=f.*,g.* -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the flag `-polly-ignore-func` works with regexes.
;
diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop.ll b/polly/test/ScopDetect/index_from_unpredictable_loop.ll
index 27ed64da17e6..f6d6cfab0eed 100644
--- a/polly/test/ScopDetect/index_from_unpredictable_loop.ll
+++ b/polly/test/ScopDetect/index_from_unpredictable_loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
index 9b5a3a4389d4..16d47619b0ff 100644
--- a/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
+++ b/polly/test/ScopDetect/index_from_unpredictable_loop2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=AFFINE
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=AFFINE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopDetect/indvars.ll b/polly/test/ScopDetect/indvars.ll
index 2ba4d1f5aabf..3fbc4d65bbe2 100644
--- a/polly/test/ScopDetect/indvars.ll
+++ b/polly/test/ScopDetect/indvars.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -polly-codegen -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
;
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopDetect/intrinsics_1.ll b/polly/test/ScopDetect/intrinsics_1.ll
index 65d3968e247c..0f9c70084a3d 100644
--- a/polly/test/ScopDetect/intrinsics_1.ll
+++ b/polly/test/ScopDetect/intrinsics_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Valid Region for Scop: for.cond => for.end
;
diff --git a/polly/test/ScopDetect/intrinsics_2.ll b/polly/test/ScopDetect/intrinsics_2.ll
index f0575511b2ef..1db9807cadb8 100644
--- a/polly/test/ScopDetect/intrinsics_2.ll
+++ b/polly/test/ScopDetect/intrinsics_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we allow the lifetime markers for the tmp array.
;
diff --git a/polly/test/ScopDetect/intrinsics_3.ll b/polly/test/ScopDetect/intrinsics_3.ll
index bce90d136a41..a230d0aa831c 100644
--- a/polly/test/ScopDetect/intrinsics_3.ll
+++ b/polly/test/ScopDetect/intrinsics_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we allow the misc intrinsics.
;
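Tests that previously scheduled BasicAA explicitly with -basic-aa now request it through the new pass manager's alias-analysis pipeline option instead, as in these intrinsics tests:

    ; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s

becomes

    ; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s

-aa-pipeline selects which alias analyses back the AAManager rather than adding a pass to the transformation pipeline.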
diff --git a/polly/test/ScopDetect/invalid-latch-conditions.ll b/polly/test/ScopDetect/invalid-latch-conditions.ll
index eb8097470ecf..db4898c9c7bd 100644
--- a/polly/test/ScopDetect/invalid-latch-conditions.ll
+++ b/polly/test/ScopDetect/invalid-latch-conditions.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=NALOOPS
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NALOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
; The latch conditions of the outer loop are not affine, thus the loop cannot
; handled by the domain generation and needs to be overapproximated.
diff --git a/polly/test/ScopDetect/invalidate_scalar_evolution.ll b/polly/test/ScopDetect/invalidate_scalar_evolution.ll
index 01d34c49e289..ddef510ad4d9 100644
--- a/polly/test/ScopDetect/invalidate_scalar_evolution.ll
+++ b/polly/test/ScopDetect/invalidate_scalar_evolution.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=PHI
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/invariant-load-before-scop.ll b/polly/test/ScopDetect/invariant-load-before-scop.ll
index f72085ff88a1..10479643959c 100644
--- a/polly/test/ScopDetect/invariant-load-before-scop.ll
+++ b/polly/test/ScopDetect/invariant-load-before-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; The LoadInst %.b761 is defined outside the SCoP, hence is always constant
; within it. It is no "required invariant load".
diff --git a/polly/test/ScopDetect/keep_going_expansion.ll b/polly/test/ScopDetect/keep_going_expansion.ll
index 9bcfb3924f6a..074aae9ae95c 100644
--- a/polly/test/ScopDetect/keep_going_expansion.ll
+++ b/polly/test/ScopDetect/keep_going_expansion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-detect-track-failures -polly-detect-keep-going -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-detect-track-failures -polly-detect-keep-going '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetect/mod_ref_read_pointer.ll b/polly/test/ScopDetect/mod_ref_read_pointer.ll
index 95a4649f4705..64535d85f2ab 100644
--- a/polly/test/ScopDetect/mod_ref_read_pointer.ll
+++ b/polly/test/ScopDetect/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-modref-calls -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=MODREF
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=MODREF
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop: for.body => for.end
; MODREF: Valid Region for Scop: for.body => for.end
diff --git a/polly/test/ScopDetect/more-than-one-loop.ll b/polly/test/ScopDetect/more-than-one-loop.ll
index bfd226c1bcfc..30090652326d 100644
--- a/polly/test/ScopDetect/more-than-one-loop.ll
+++ b/polly/test/ScopDetect/more-than-one-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-process-unprofitable=true -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/multidim-with-undef-size.ll b/polly/test/ScopDetect/multidim-with-undef-size.ll
index 9973c6c72169..2a5f8b15534f 100644
--- a/polly/test/ScopDetect/multidim-with-undef-size.ll
+++ b/polly/test/ScopDetect/multidim-with-undef-size.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: Valid Region for Scop: bb14 => bb17
diff --git a/polly/test/ScopDetect/multidim.ll b/polly/test/ScopDetect/multidim.ll
index f43698819f32..91202373263f 100644
--- a/polly/test/ScopDetect/multidim.ll
+++ b/polly/test/ScopDetect/multidim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; CHECK: Valid Region for Scop: bb19 => bb20
diff --git a/polly/test/ScopDetect/multidim_indirect_access.ll b/polly/test/ScopDetect/multidim_indirect_access.ll
index 3e06251f5fd1..a9cd446d2767 100644
--- a/polly/test/ScopDetect/multidim_indirect_access.ll
+++ b/polly/test/ScopDetect/multidim_indirect_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we will recognize this SCoP.
;
diff --git a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
index ed554a24a6d6..9c91fbfbe0b6 100644
--- a/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
+++ b/polly/test/ScopDetect/multidim_two_accesses_different_delinearization.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopDetect/nested_loop_single_exit.ll b/polly/test/ScopDetect/nested_loop_single_exit.ll
index 377e8088eedb..a0742112b6e1 100644
--- a/polly/test/ScopDetect/nested_loop_single_exit.ll
+++ b/polly/test/ScopDetect/nested_loop_single_exit.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
; void f(long A[], long N) {
; long i, j;
diff --git a/polly/test/ScopDetect/non-affine-conditional.ll b/polly/test/ScopDetect/non-affine-conditional.ll
index fc2d0c02d2da..e74619cd8775 100644
--- a/polly/test/ScopDetect/non-affine-conditional.ll
+++ b/polly/test/ScopDetect/non-affine-conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopDetect/non-affine-float-compare.ll b/polly/test/ScopDetect/non-affine-float-compare.ll
index 984f14aaff8f..9326cd429038 100644
--- a/polly/test/ScopDetect/non-affine-float-compare.ll
+++ b/polly/test/ScopDetect/non-affine-float-compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(float *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
index 068367fa1e3c..1ab6b35ae93f 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; Here we have a non-affine loop but also a non-affine access which should
; be rejected as long as -polly-allow-nonaffine isn't given.
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
index cd2140518b46..921f6ab53549 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
;
; Here we have a non-affine loop (in the context of the loop nest)
; and also a non-affine access (A[k]). While we can always detect the
diff --git a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
index fb936216e45c..78774d92e0a4 100644
--- a/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopDetect/non-affine-loop-condition-dependent-access_3.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
;
; Here we have a non-affine loop (in the context of the loop nest)
; and also a non-affine access (A[k]). While we can always detect the
diff --git a/polly/test/ScopDetect/non-affine-loop.ll b/polly/test/ScopDetect/non-affine-loop.ll
index d5f7ea128a79..5136b3b8779b 100644
--- a/polly/test/ScopDetect/non-affine-loop.ll
+++ b/polly/test/ScopDetect/non-affine-loop.ll
@@ -1,8 +1,8 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINEREGIONSANDACCESSES
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALLOWNONAFFINELOOPSANDACCESSES
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; This function/region does contain a loop, however it is non-affine, hence the access
; A[i] is also. Furthermore, it is the only loop, thus when we over approximate
diff --git a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
index 43af1684dccb..fd52c5df7b27 100644
--- a/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
+++ b/polly/test/ScopDetect/non-beneficial-loops-small-trip-count.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid
;
diff --git a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
index 4cddcc916a76..d0c1f7a61333 100644
--- a/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
+++ b/polly/test/ScopDetect/non-constant-add-rec-start-expr.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Valid Region for Scop: bb11 => bb25
diff --git a/polly/test/ScopDetect/non-simple-memory-accesses.ll b/polly/test/ScopDetect/non-simple-memory-accesses.ll
index a82228982885..bdc48984f996 100644
--- a/polly/test/ScopDetect/non-simple-memory-accesses.ll
+++ b/polly/test/ScopDetect/non-simple-memory-accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we do not model atomic memory accesses. We did not reason about
; how to handle them correctly and the Alias Set Tracker models some of them
diff --git a/polly/test/ScopDetect/non_affine_loop_condition.ll b/polly/test/ScopDetect/non_affine_loop_condition.ll
index f268442cd8ee..63bd7b3a2f1f 100644
--- a/polly/test/ScopDetect/non_affine_loop_condition.ll
+++ b/polly/test/ScopDetect/non_affine_loop_condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false -polly-print-detect -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; void f(int *A) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopDetect/only-one-affine-loop.ll b/polly/test/ScopDetect/only-one-affine-loop.ll
index d6d50bb611d9..1d36f4df35bc 100644
--- a/polly/test/ScopDetect/only-one-affine-loop.ll
+++ b/polly/test/ScopDetect/only-one-affine-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Even if we allow non-affine loops we can only model the outermost loop, all
; other loops are boxed in non-affine regions. However, the inner loops can be
diff --git a/polly/test/ScopDetect/only_func_flag.ll b/polly/test/ScopDetect/only_func_flag.ll
index d465cd0f50f7..4742375fec5c 100644
--- a/polly/test/ScopDetect/only_func_flag.ll
+++ b/polly/test/ScopDetect/only_func_flag.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-only-func=f,g -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-only-func=f,g '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the flag `-polly-only-func` limits analysis to `f` and `g`.
;
diff --git a/polly/test/ScopDetect/only_func_flag_regex.ll b/polly/test/ScopDetect/only_func_flag_regex.ll
index e6675798eeb9..2ad22c9f7a7f 100644
--- a/polly/test/ScopDetect/only_func_flag_regex.ll
+++ b/polly/test/ScopDetect/only_func_flag_regex.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-only-func=f.*,g.* -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-only-func=f.*,g.* '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the flag `-polly-only-func` works with regexes.
;
diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
index fc957a7f912c..271825a58c39 100644
--- a/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
+++ b/polly/test/ScopDetect/parametric-multiply-in-scev-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: Valid Region
diff --git a/polly/test/ScopDetect/parametric-multiply-in-scev.ll b/polly/test/ScopDetect/parametric-multiply-in-scev.ll
index 9c6e5ccc8f52..2ab8997c6333 100644
--- a/polly/test/ScopDetect/parametric-multiply-in-scev.ll
+++ b/polly/test/ScopDetect/parametric-multiply-in-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; foo(float *A, long n, long k) {
; if (true)
diff --git a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
index 054de168d76b..248bb43aacd9 100644
--- a/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
+++ b/polly/test/ScopDetect/phi_with_multi_exiting_edges.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Region with an exit node that has a PHI node multiple incoming edges from
; inside the region. Motivation for supporting such cases in Polly.
diff --git a/polly/test/ScopDetect/profitability-large-basic-blocks.ll b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
index e1650febf11c..d74185b45c75 100644
--- a/polly/test/ScopDetect/profitability-large-basic-blocks.ll
+++ b/polly/test/ScopDetect/profitability-large-basic-blocks.ll
@@ -1,12 +1,12 @@
-; RUN: opt %loadPolly -polly-process-unprofitable=false \
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \
; RUN: -polly-detect-profitability-min-per-loop-insts=40 \
-; RUN: -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=PROFITABLE
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
-; RUN: opt %loadPolly -polly-process-unprofitable=true \
-; RUN: -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=PROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=true \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFITABLE
-; RUN: opt %loadPolly -polly-process-unprofitable=false \
-; RUN: -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=UNPROFITABLE
+; RUN: opt %loadNPMPolly -polly-process-unprofitable=false \
+; RUN: '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNPROFITABLE
; UNPROFITABLE-NOT: Valid Region for Scop:
; PROFITABLE: Valid Region for Scop:
diff --git a/polly/test/ScopDetect/profitability-two-nested-loops.ll b/polly/test/ScopDetect/profitability-two-nested-loops.ll
index 525f91cbc2f4..0291d3be452a 100644
--- a/polly/test/ScopDetect/profitability-two-nested-loops.ll
+++ b/polly/test/ScopDetect/profitability-two-nested-loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Valid Region for Scop: next => bb3
;
diff --git a/polly/test/ScopDetect/remove_all_children.ll b/polly/test/ScopDetect/remove_all_children.ll
index 6d5097b80607..d95e9bde0b38 100644
--- a/polly/test/ScopDetect/remove_all_children.ll
+++ b/polly/test/ScopDetect/remove_all_children.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetect/report-scop-location.ll b/polly/test/ScopDetect/report-scop-location.ll
index 750699cbe763..a99a2ef2b484 100644
--- a/polly/test/ScopDetect/report-scop-location.ll
+++ b/polly/test/ScopDetect/report-scop-location.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -polly-report -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -polly-report -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
; Function Attrs: nounwind uwtable
diff --git a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
index e94f1e7728c5..f49190b33ccf 100644
--- a/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
+++ b/polly/test/ScopDetect/restrict-undef-size-scopdetect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: Valid Region for Scop:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetect/run_time_alias_check.ll b/polly/test/ScopDetect/run_time_alias_check.ll
index 672f3dfa6365..74cbedb34e5c 100644
--- a/polly/test/ScopDetect/run_time_alias_check.ll
+++ b/polly/test/ScopDetect/run_time_alias_check.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/polly/test/ScopDetect/scev_remove_max.ll b/polly/test/ScopDetect/scev_remove_max.ll
index 5353e06bdf2f..caf55bf87a66 100644
--- a/polly/test/ScopDetect/scev_remove_max.ll
+++ b/polly/test/ScopDetect/scev_remove_max.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s
; This test case helps to determine whether SCEVRemoveMax::remove produces
; an infinite loop and a segmentation fault, if it processes, for example,
diff --git a/polly/test/ScopDetect/sequential_loops.ll b/polly/test/ScopDetect/sequential_loops.ll
index e6ac38aa1604..4a84f356f3e8 100644
--- a/polly/test/ScopDetect/sequential_loops.ll
+++ b/polly/test/ScopDetect/sequential_loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -13,7 +13,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; }
define void @f1(ptr %A, i64 %N) nounwind {
-; CHECK-LABEL: 'Polly - Detect static control parts (SCoPs)' for function 'f1'
+; CHECK-LABEL: Detected Scops in Function f1
entry:
fence seq_cst
br label %for.i.1
@@ -60,7 +60,7 @@ return:
; }
define void @f2(ptr %A, i64 %N) nounwind {
-; CHECK-LABEL: 'Polly - Detect static control parts (SCoPs)' for function 'f2'
+; CHECK-LABEL: Detected Scops in Function f2
entry:
fence seq_cst
br label %for.i.1
diff --git a/polly/test/ScopDetect/simple_loop.ll b/polly/test/ScopDetect/simple_loop.ll
index c8ed89a97d00..33823b21fb8f 100644
--- a/polly/test/ScopDetect/simple_loop.ll
+++ b/polly/test/ScopDetect/simple_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_entry.ll b/polly/test/ScopDetect/simple_loop_non_single_entry.ll
index 22adec5d2039..1bba2c21c747 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_entry.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit.ll b/polly/test/ScopDetect/simple_loop_non_single_exit.ll
index 71ac830cae7d..93ec84e911c5 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_exit.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_exit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
index d9915dc130d5..33b0d8d7d6fc 100644
--- a/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
+++ b/polly/test/ScopDetect/simple_loop_non_single_exit_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
index 867bd50513f0..9b47b7c946ca 100644
--- a/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
+++ b/polly/test/ScopDetect/simple_loop_two_phi_nodes.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/simple_loop_with_param.ll b/polly/test/ScopDetect/simple_loop_with_param.ll
index 1ae5c6608739..4a0a3adab661 100644
--- a/polly/test/ScopDetect/simple_loop_with_param.ll
+++ b/polly/test/ScopDetect/simple_loop_with_param.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s -check-prefix=PHI
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PHI
; void f(long A[], long N, long *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopDetect/simple_loop_with_param_2.ll b/polly/test/ScopDetect/simple_loop_with_param_2.ll
index 1a4750621c19..670936b6fee8 100644
--- a/polly/test/ScopDetect/simple_loop_with_param_2.ll
+++ b/polly/test/ScopDetect/simple_loop_with_param_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopDetect/simple_non_single_entry.ll b/polly/test/ScopDetect/simple_non_single_entry.ll
index a1995a427903..6ace3b636019 100644
--- a/polly/test/ScopDetect/simple_non_single_entry.ll
+++ b/polly/test/ScopDetect/simple_non_single_entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], long N) {
; long i;
diff --git a/polly/test/ScopDetect/skip_function_attribute.ll b/polly/test/ScopDetect/skip_function_attribute.ll
index e85dbd4c2b83..2150a3e8c35d 100644
--- a/polly/test/ScopDetect/skip_function_attribute.ll
+++ b/polly/test/ScopDetect/skip_function_attribute.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify polly skips this function
;
diff --git a/polly/test/ScopDetect/srem_with_parametric_divisor.ll b/polly/test/ScopDetect/srem_with_parametric_divisor.ll
index 4b5c3b04c2ce..66c3b045f62a 100644
--- a/polly/test/ScopDetect/srem_with_parametric_divisor.ll
+++ b/polly/test/ScopDetect/srem_with_parametric_divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop:
;
diff --git a/polly/test/ScopDetect/statistics.ll b/polly/test/ScopDetect/statistics.ll
index 64df3d081605..a1dcebec63ff 100644
--- a/polly/test/ScopDetect/statistics.ll
+++ b/polly/test/ScopDetect/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -stats -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScopDetect/switch-in-loop-patch.ll b/polly/test/ScopDetect/switch-in-loop-patch.ll
index ab4729fc09a4..2f9b670384db 100644
--- a/polly/test/ScopDetect/switch-in-loop-patch.ll
+++ b/polly/test/ScopDetect/switch-in-loop-patch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: Valid
diff --git a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
index 97ba7f9634e9..4ae86a940e0c 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportAlias-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -polly-use-runtime-alias-checks=false -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
;void f(int A[], int B[]) {
; for (int i=0; i<42; i++)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
index fc21e192f32c..adb14b5b017d 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportEntry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
; CHECK: remark: <unknown>:0:0: Scop contains function entry (not yet supported).
diff --git a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
index abace4ba520d..428a7cf855f6 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportFuncCall-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; #define N 1024
; double invalidCall(double A[N]);
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
index 8368a68b42f0..d22c3b6d27c3 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
;void foo(int a, int b) {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
index 82c6c33e287c..2bc515e0ae5e 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportIrreducibleRegionWithoutDebugLoc.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
; CHECK: remark: <unknown>:0:0: Irreducible region encountered in control flow.
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
index 35986b5e0b35..cb913000a993 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopBound-01.ll
@@ -1,15 +1,15 @@
-; RUN: opt %loadPolly \
+; RUN: opt %loadNPMPolly \
; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \
-; RUN: -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output \
+; RUN: -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output \
; RUN: < %s 2>&1| FileCheck %s --check-prefix=REJECTNONAFFINELOOPS
-; RUN: opt %loadPolly \
+; RUN: opt %loadNPMPolly \
; RUN: -pass-remarks-missed="polly-detect" -polly-detect-track-failures \
-; RUN: -polly-allow-nonaffine-loops=true -polly-print-detect -disable-output \
+; RUN: -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>' -disable-output \
; RUN: < %s 2>&1| FileCheck %s --check-prefix=ALLOWNONAFFINELOOPS
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" \
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
; RUN: -polly-process-unprofitable=false \
; RUN: -polly-detect-track-failures -polly-allow-nonaffine-loops=true \
-; RUN: -polly-allow-nonaffine -polly-print-detect -disable-output < %s 2>&1 \
+; RUN: -polly-allow-nonaffine '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s --check-prefix=ALLOWNONAFFINEALL
; void f(int A[], int n) {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
index 5dbeaded45c9..92028093f70b 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll
@@ -4,8 +4,8 @@
; the PostDominatorTree. Infinite loops are postdominated only by the virtual
; root, which causes them not to appear in regions in ScopDetection anymore.
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops=false -polly-print-detect -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops=false '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
; void func (int param0, int N, int *A)
; {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
index 634b63e6d44d..dd95bd6ede71 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportMultipleNonAffineAccesses.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=ALL
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN-ALL
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadPolly -basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-delinearize=false -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-detect-keep-going -disable-output < %s 2>&1| FileCheck %s -check-prefix=DELIN-ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -polly-allow-nonaffine -disable-output < %s 2>&1| FileCheck %s -check-prefix=NONAFFINE
; 1 void manyaccesses(float A[restrict], long n, float B[restrict][n])
; 2 {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
index 23d8c9c061c9..832045f089d6 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportNonAffineAccess-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
; void f(int A[]) {
; for(int i=0; i<42; ++i)
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
index d35b7a28ba89..b951487d6197 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnprofitable.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" \
-; RUN: -polly-detect-track-failures -polly-print-detect -disable-output \
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \
; RUN: -polly-process-unprofitable=false < %s 2>&1| FileCheck %s
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" \
-; RUN: -polly-detect-track-failures -polly-print-detect -disable-output \
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" \
+; RUN: -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output \
; RUN: -polly-process-unprofitable=false < %s 2>&1 -pass-remarks-output=%t.yaml
; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
index 6c868db78ce7..d110cfefc27d 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportUnreachableInExit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s \
; RUN: -pass-remarks-missed="polly-detect" 2>&1 | FileCheck %s
; void f(long A[], long N) {
diff --git a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
index a82f56b7a5fa..c2efd6165a26 100644
--- a/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
+++ b/polly/test/ScopDetectionDiagnostics/ReportVariantBasePtr-01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-print-detect -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output < %s 2>&1| FileCheck %s
; struct b {
; double **b;
diff --git a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
index a0f2704b1372..3cdeed13ec28 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_has_multiple_exits.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures -polly-detect -disable-output 2>&1 < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-detect-track-failures '-passes=print<polly-detect>' -disable-output 2>&1 < %s | FileCheck %s -match-full-lines
;
; Derived from test-suite/MultiSource/Benchmarks/BitBench/uuencode/uuencode.c
;
diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
index 667ed7d18ab5..4a9a200d67df 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
diff --git a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
index 9dce56a3a3c4..61ff033d9f93 100644
--- a/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
+++ b/polly/test/ScopDetectionDiagnostics/loop_partially_in_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -pass-remarks-missed="polly-detect" -disable-output < %s 2>&1| FileCheck %s
; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
; CHECK: remark: <unknown>:0:0: Loop cannot be handled because not all latches are part of loop region.
diff --git a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
index 94dd5824777c..c5efec3f50c5 100644
--- a/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
+++ b/polly/test/ScopInfo/20110312-Fail-without-basicaa.ll
@@ -1,5 +1,5 @@
; This should be run without alias analysis enabled.
-;RUN: opt %loadPolly -polly-scops -disable-output < %s
+;RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
define i32 @main() nounwind {
diff --git a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
index f80177cb90e7..81c7efb96365 100644
--- a/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
+++ b/polly/test/ScopInfo/20111108-Parameter-not-detected.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
declare void @foo()
diff --git a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
index b55d635947e5..5abf8ff29ef8 100644
--- a/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
+++ b/polly/test/ScopInfo/2012-03-16-Crash-because-of-unsigned-in-scev.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-a0:0-n32"
diff --git a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
index d4d931fd2e0c..d16ba453f981 100644
--- a/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
+++ b/polly/test/ScopInfo/2015-10-04-Crash-in-domain-generation.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/Alias-0.ll b/polly/test/ScopInfo/Alias-0.ll
index 0fc4ad91b7db..ebbe744627ef 100644
--- a/polly/test/ScopInfo/Alias-0.ll
+++ b/polly/test/ScopInfo/Alias-0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-1.ll b/polly/test/ScopInfo/Alias-1.ll
index eab8c062f4ba..b1711c25857d 100644
--- a/polly/test/ScopInfo/Alias-1.ll
+++ b/polly/test/ScopInfo/Alias-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-2.ll b/polly/test/ScopInfo/Alias-2.ll
index 64f1e0bc919d..b94f130c94eb 100644
--- a/polly/test/ScopInfo/Alias-2.ll
+++ b/polly/test/ScopInfo/Alias-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-3.ll b/polly/test/ScopInfo/Alias-3.ll
index 5e9b94e692bc..af7816546b4a 100644
--- a/polly/test/ScopInfo/Alias-3.ll
+++ b/polly/test/ScopInfo/Alias-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/Alias-4.ll b/polly/test/ScopInfo/Alias-4.ll
index 4d5a91abb96f..fe651c87b241 100644
--- a/polly/test/ScopInfo/Alias-4.ll
+++ b/polly/test/ScopInfo/Alias-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -disable-basic-aa -polly-print-scops -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
-; RUN: opt %loadPolly -disable-basic-aa -polly-print-scops -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
+; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=RTA
+; RUN: opt %loadNPMPolly -aa-pipeline= '-passes=print<polly-detect>,print<polly-function-scops>' -polly-use-runtime-alias-checks=false -disable-output < %s -stats 2>&1 | FileCheck %s --check-prefix=NORTA
; REQUIRES: asserts
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/BoundChecks/single-loop.ll b/polly/test/ScopInfo/BoundChecks/single-loop.ll
index bc96c907afc9..10a0a58f381d 100644
--- a/polly/test/ScopInfo/BoundChecks/single-loop.ll
+++ b/polly/test/ScopInfo/BoundChecks/single-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; This only works after the post-dominator tree has been fixed.
;
diff --git a/polly/test/ScopInfo/BoundChecks/two-loops.ll b/polly/test/ScopInfo/BoundChecks/two-loops.ll
index 14e07f42a3ae..c85ac5b4ba8f 100644
--- a/polly/test/ScopInfo/BoundChecks/two-loops.ll
+++ b/polly/test/ScopInfo/BoundChecks/two-loops.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output< %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; This only works after the post-dominator tree has been fixed.
; XFAIL: *
diff --git a/polly/test/ScopInfo/NonAffine/div_backedge.ll b/polly/test/ScopInfo/NonAffine/div_backedge.ll
index a6aca032ef62..3b0c673ece38 100644
--- a/polly/test/ScopInfo/NonAffine/div_backedge.ll
+++ b/polly/test/ScopInfo/NonAffine/div_backedge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(float *A) {
; for (long i = 1;; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/div_domain.ll b/polly/test/ScopInfo/NonAffine/div_domain.ll
index f61c4eb459ed..34a5cecdfe3d 100644
--- a/polly/test/ScopInfo/NonAffine/div_domain.ll
+++ b/polly/test/ScopInfo/NonAffine/div_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(float *A) {
; for (long i = 0; i < 16; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
index f5d63dfb9d2c..7d02fae7f98f 100644
--- a/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
+++ b/polly/test/ScopInfo/NonAffine/invariant_loads_dependent_in_non_affine_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int *B, int *C) {
; for (int i = 0; i < 1000; i++)
diff --git a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
index dec63ca6813d..d5c808d9021f 100644
--- a/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
+++ b/polly/test/ScopInfo/NonAffine/modulo_backedge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Domain :=
; CHECK: { Stmt_for_body[i0] : 0 <= i0 <= 6 };
diff --git a/polly/test/ScopInfo/NonAffine/modulo_domain.ll b/polly/test/ScopInfo/NonAffine/modulo_domain.ll
index f5ebec2b0346..13fe53f11633 100644
--- a/polly/test/ScopInfo/NonAffine/modulo_domain.ll
+++ b/polly/test/ScopInfo/NonAffine/modulo_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; TODO: The new domain generation cannot handle modulo domain constraints,
; hence modulo handling has been disabled completely. Once this is
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
index 837d9b21b16e..2b8427d74ec8 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCALAR
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALAR
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=PROFIT
;
; SCALAR: Function: f
; SCALAR-NEXT: Region: %bb1---%bb13
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
index e39569abc52d..30f756e81e47 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_2.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
;
; Here we have a non-affine loop (in the context of the loop nest)
; and also a non-affine access (A[k]). While we can always model the
diff --git a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
index 75dd7ac26bb3..6dacd719862e 100644
--- a/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
+++ b/polly/test/ScopInfo/NonAffine/non-affine-loop-condition-dependent-access_3.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
;
; Here we have a non-affine loop (in the context of the loop nest)
; and also a non-affine access (A[k]). While we can always model the
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
index 34b04933af86..8a13f791ed6d 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_access_with_range_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A) {
; for (int i = 0; i < 128; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
index 9955c88b2cfd..1e70d2c9db87 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_but_sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_for_body
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
index b194ee762e9f..dcfaa9280dcb 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_but_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void pos(float *A, long n) {
; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
index 1f55530b137d..24bfe6050216 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_nested.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
index 3511362304b4..931ad36d15f3 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_affine_loop.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches \
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -polly-allow-nonaffine \
+; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s \
+; RUN: '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s \
; RUN: --check-prefix=ALL
;
; Negative test for INNERMOST.
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
index c2e1e46f6f18..37b51cebd74d 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_conditional_surrounding_non_affine_loop.ll
@@ -1,16 +1,16 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches \
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=INNERMOST
-; RUN: opt %loadPolly -polly-allow-nonaffine \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=INNERMOST
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=ALL
-; RUN: opt %loadPolly -polly-allow-nonaffine \
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=ALL
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -polly-process-unprofitable=false \
; RUN: -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops=true \
-; RUN: -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; Negative test for INNERMOST.
; At the moment we will optimistically assume A[i] in the conditional before the inner
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
index c62447b6c15c..7bfd7f86efcd 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_float_compare.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(float *A) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
index 873b44b9c8cf..fc779d544e62 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_condition.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=PROFIT
-; RUN: opt %loadPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=NO-REDUCTION
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-detect-reductions=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NO-REDUCTION
;
; void f(int *A, int *C) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
index 127bf80b9451..79b61eca258f 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=PROFIT
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT
;
; Verify that we over-approximate the read access of A[j] in the last statement as j is
; computed in a non-affine loop we do not model.
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
index de011e29aeea..d33befe2c66e 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_parametric_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, double A[], int INDEX[]) {
diff --git a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
index 7303b4ea47fd..77c2df48d651 100644
--- a/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
+++ b/polly/test/ScopInfo/NonAffine/non_affine_region_guaranteed_non-entry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-detect -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -polly-detect '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
index 4f54d03d43fb..9ed340d1d304 100644
--- a/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
+++ b/polly/test/ScopInfo/NonAffine/whole-scop-non-affine-subregion-in-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; Regression test that triggered a memory leak at some point (24947).
;
diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
index dc59fbfc66a8..cbd024ba7a39 100644
--- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
+++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that there is no alias group because we access either A or B, never both.
;
diff --git a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
index a19d60dd9147..3858d8a7bb1d 100644
--- a/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
+++ b/polly/test/ScopInfo/aliasing_conditional_alias_groups_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we create two alias groups since the minimal/maximal accesses
; depend on %b.
diff --git a/polly/test/ScopInfo/aliasing_dead_access.ll b/polly/test/ScopInfo/aliasing_dead_access.ll
index 2a725cf3c855..7baa3dce1f9d 100644
--- a/polly/test/ScopInfo/aliasing_dead_access.ll
+++ b/polly/test/ScopInfo/aliasing_dead_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not create a SCoP if there is no statement executed.
;
diff --git a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
index 937d4ada3ec9..7265aab22a49 100644
--- a/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
+++ b/polly/test/ScopInfo/aliasing_many_arrays_to_compare.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: < %s | FileCheck %s --check-prefix=FOUND
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-rtc-max-arrays-per-group=3 < %s | FileCheck %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: < %s 2>&1 | FileCheck %s --check-prefix=FOUND
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: -polly-rtc-max-arrays-per-group=3 < %s 2>&1 | FileCheck %s \
; RUN: --check-prefix=IGNORED
;
; FOUND: Function: foo
diff --git a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
index c22cfe55e118..d66a10bc511b 100644
--- a/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
+++ b/polly/test/ScopInfo/aliasing_many_read_only_acesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
index 16cb3dc0f5ac..9943802ec859 100644
--- a/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
+++ b/polly/test/ScopInfo/aliasing_multiple_alias_groups.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=NOAA
-; RUN: opt %loadPolly -polly-print-scops -disable-output -tbaa < %s | FileCheck %s --check-prefix=TBAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline= < %s 2>&1 | FileCheck %s --check-prefix=NOAA
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -aa-pipeline=tbaa < %s 2>&1 | FileCheck %s --check-prefix=TBAA
;
; void jd(int *Int0, int *Int1, float *Float0, float *Float1) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
index 056b644cd5ed..900d5d40d96f 100644
--- a/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
+++ b/polly/test/ScopInfo/aliasing_with_non_affine_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-ast -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -polly-process-unprofitable -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
;
; @test1
; Make sure we generate the correct aliasing check for a fixed-size memset operation.
diff --git a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
index d170a50e26fc..cb06e352da65 100644
--- a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
+++ b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll
@@ -1,14 +1,14 @@
-; RUN: opt %loadPolly -disable-output -polly-invariant-load-hoisting \
+; RUN: opt %loadNPMPolly -disable-output -polly-invariant-load-hoisting \
; RUN: -polly-allow-dereference-of-all-function-parameters \
-; RUN: -polly-print-scops < %s | FileCheck %s --check-prefix=SCOP
+; RUN: '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s --check-prefix=SCOP
-; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \
-; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE-RTC
+; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \
+; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE-RTC
-; RUN: opt %loadPolly -S -polly-invariant-load-hoisting \
+; RUN: opt %loadNPMPolly -S -polly-invariant-load-hoisting \
; RUN: -polly-allow-dereference-of-all-function-parameters \
-; RUN: -polly-codegen < %s | FileCheck %s --check-prefix=CODE
+; RUN: -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=CODE
; SCOP: Function: hoge
; SCOP-NEXT: Region: %bb15---%bb37
diff --git a/polly/test/ScopInfo/assume_gep_bounds.ll b/polly/test/ScopInfo/assume_gep_bounds.ll
index d0ce47148071..bd14e3868d52 100644
--- a/polly/test/ScopInfo/assume_gep_bounds.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void foo(float A[][20][30], long n, long m, long p) {
; for (long i = 0; i < n; i++)
diff --git a/polly/test/ScopInfo/assume_gep_bounds_2.ll b/polly/test/ScopInfo/assume_gep_bounds_2.ll
index e327195da94c..7a8c1870abe2 100644
--- a/polly/test/ScopInfo/assume_gep_bounds_2.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-precise-inbounds | FileCheck %s
;
; void foo(float A[restrict][20], float B[restrict][20], long n, long m,
diff --git a/polly/test/ScopInfo/assume_gep_bounds_many.ll b/polly/test/ScopInfo/assume_gep_bounds_many.ll
index 261491564fc2..01fc12cd7f10 100644
--- a/polly/test/ScopInfo/assume_gep_bounds_many.ll
+++ b/polly/test/ScopInfo/assume_gep_bounds_many.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -disable-output -polly-print-scops -polly-ignore-aliasing \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' -polly-ignore-aliasing \
+; RUN: < %s 2>&1 | FileCheck %s
; CHECK: Assumed Context:
; CHECK-NEXT: [n1_a, n1_b, n1_c, n1_d, n2_a, n2_b, n2_c, n2_d, n3_a, n3_b, n3_c, n3_d, n4_a, n4_b, n4_c, n4_d, n5_a, n5_b, n5_c, n5_d, n6_a, n6_b, n6_c, n6_d, n7_a, n7_b, n7_c, n7_d, n8_a, n8_b, n8_c, n8_d, n9_a, n9_b, n9_c, n9_d, p1_b, p1_c, p1_d, p2_b, p2_c, p2_d, p3_b, p3_c, p3_d, p4_b, p4_c, p4_d, p5_b, p5_c, p5_d, p6_b, p6_c, p6_d, p7_b, p7_c, p7_d, p8_b, p8_c, p8_d, p9_b, p9_c, p9_d] -> { : p1_b >= n1_b and p1_c >= n1_c and p1_d >= n1_d and p2_b >= n2_b and p2_c >= n2_c and p2_d >= n2_d and p3_b >= n3_b and p3_c >= n3_c and p3_d >= n3_d and p4_b >= n4_b and p4_c >= n4_c and p4_d >= n4_d and p5_b >= n5_b and p5_c >= n5_c and p5_d >= n5_d and p6_b >= n6_b and p6_c >= n6_c and p6_d >= n6_d and p7_b >= n7_b and p7_c >= n7_c and p7_d >= n7_d and p8_b >= n8_b and p8_c >= n8_c and p8_d >= n8_d and p9_b >= n9_b and p9_c >= n9_c and p9_d >= n9_d }
diff --git a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
index 0e17eb1d3668..3fb7a1329c74 100644
--- a/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
+++ b/polly/test/ScopInfo/avoid_new_parameters_from_geps.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do no introduce a parameter here that is actually not needed.
;
diff --git a/polly/test/ScopInfo/bool-addrec.ll b/polly/test/ScopInfo/bool-addrec.ll
index 1924a4b5266b..81fcade08f65 100644
--- a/polly/test/ScopInfo/bool-addrec.ll
+++ b/polly/test/ScopInfo/bool-addrec.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -disable-output -polly-print-ast -polly-process-unprofitable < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-ast>' -polly-process-unprofitable < %s 2>&1 | FileCheck %s
; CHECK: for (int c0 = 0; c0 <= 19999; c0 += 1) {
; CHECK-NEXT: if (c0 % 2 == 0)
diff --git a/polly/test/ScopInfo/bounded_loop_assumptions.ll b/polly/test/ScopInfo/bounded_loop_assumptions.ll
index d472c7586c53..5628092de776 100644
--- a/polly/test/ScopInfo/bounded_loop_assumptions.ll
+++ b/polly/test/ScopInfo/bounded_loop_assumptions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The assumed context is tricky here as the equality test for the inner loop
; allows an "unbounded" loop trip count. We assume that does not happen, thus
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
index 5c5f264aab60..83743e4e4ecc 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-2.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=SCOP
; DETECT: Valid Region for Scop: loop => barrier
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
index d69d3a16c0d7..9685ba37a49a 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations-3.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output \
-; RUN: -polly-allow-nonaffine-branches=false < %s | \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE
; NONAFFINE: Statements {
diff --git a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
index 57918fa5c92d..f41e6500fb30 100644
--- a/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
+++ b/polly/test/ScopInfo/branch-references-loop-scev-with-unknown-iterations.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=NONAFFINE
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-allow-nonaffine-branches=false < %s | \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: -polly-allow-nonaffine-branches=false < %s 2>&1 | \
; RUN: FileCheck %s -check-prefix=NO-NONEAFFINE
; NONAFFINE-NOT: Statements
diff --git a/polly/test/ScopInfo/bug_2010_10_22.ll b/polly/test/ScopInfo/bug_2010_10_22.ll
index 7ba996b6d0f1..71e7051922b5 100644
--- a/polly/test/ScopInfo/bug_2010_10_22.ll
+++ b/polly/test/ScopInfo/bug_2010_10_22.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/bug_2011_1_5.ll b/polly/test/ScopInfo/bug_2011_1_5.ll
index 95c25f9d9cdb..f4a24e06f46a 100644
--- a/polly/test/ScopInfo/bug_2011_1_5.ll
+++ b/polly/test/ScopInfo/bug_2011_1_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; Bug description: Alias Analysis thinks IntToPtrInst aliases with alloca instructions created by IndependentBlocks Pass.
; This will trigger the assertion when we are verifying the SCoP after IndependentBlocks.
diff --git a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
index 89d5f318829e..ed6bbafdac1f 100644
--- a/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
+++ b/polly/test/ScopInfo/bug_scev_not_fully_eval.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | not FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@edge.8265 = external global [72 x i32], align 32 ; <ptr> [#uses=1]
diff --git a/polly/test/ScopInfo/cfg_consequences.ll b/polly/test/ScopInfo/cfg_consequences.ll
index 84f94b135735..9161d3db4167 100644
--- a/polly/test/ScopInfo/cfg_consequences.ll
+++ b/polly/test/ScopInfo/cfg_consequences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void consequences(int *A, int bool_cond, int lhs, int rhs) {
;
diff --git a/polly/test/ScopInfo/complex-branch-structure.ll b/polly/test/ScopInfo/complex-branch-structure.ll
index 24ebdcf213f8..de79c2226e68 100644
--- a/polly/test/ScopInfo/complex-branch-structure.ll
+++ b/polly/test/ScopInfo/complex-branch-structure.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
; We build a scop of the following form to check that the domain construction
diff --git a/polly/test/ScopInfo/complex-condition.ll b/polly/test/ScopInfo/complex-condition.ll
index 31d34b033725..c3b8d2bb0ef8 100644
--- a/polly/test/ScopInfo/complex-condition.ll
+++ b/polly/test/ScopInfo/complex-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/complex-expression.ll b/polly/test/ScopInfo/complex-expression.ll
index 1822c9de852a..6a6dde62d1ae 100644
--- a/polly/test/ScopInfo/complex-expression.ll
+++ b/polly/test/ScopInfo/complex-expression.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/complex-loop-nesting.ll b/polly/test/ScopInfo/complex-loop-nesting.ll
index 97a9bfd939d5..36cb078f19ff 100644
--- a/polly/test/ScopInfo/complex-loop-nesting.ll
+++ b/polly/test/ScopInfo/complex-loop-nesting.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/complex-successor-structure-2.ll b/polly/test/ScopInfo/complex-successor-structure-2.ll
index 6bb7bb14a8cc..f4a78bf75385 100644
--- a/polly/test/ScopInfo/complex-successor-structure-2.ll
+++ b/polly/test/ScopInfo/complex-successor-structure-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
diff --git a/polly/test/ScopInfo/complex-successor-structure-3.ll b/polly/test/ScopInfo/complex-successor-structure-3.ll
index 14c3fc1babeb..6da1fe3a8b9f 100644
--- a/polly/test/ScopInfo/complex-successor-structure-3.ll
+++ b/polly/test/ScopInfo/complex-successor-structure-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -disable-output -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; Check that propagation of domains from A(X) to A(X+1) will keep the
; domains small and concise.
diff --git a/polly/test/ScopInfo/complex-successor-structure.ll b/polly/test/ScopInfo/complex-successor-structure.ll
index 364344045a6a..6c87ba3e9850 100644
--- a/polly/test/ScopInfo/complex-successor-structure.ll
+++ b/polly/test/ScopInfo/complex-successor-structure.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
diff --git a/polly/test/ScopInfo/complex_domain_binary_condition.ll b/polly/test/ScopInfo/complex_domain_binary_condition.ll
index cec26855debb..6091e3be4560 100644
--- a/polly/test/ScopInfo/complex_domain_binary_condition.ll
+++ b/polly/test/ScopInfo/complex_domain_binary_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Low complexity assumption: { : false }
diff --git a/polly/test/ScopInfo/complex_execution_context.ll b/polly/test/ScopInfo/complex_execution_context.ll
index 164254308fa9..9880a1dd67d1 100644
--- a/polly/test/ScopInfo/complex_execution_context.ll
+++ b/polly/test/ScopInfo/complex_execution_context.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/cond_constant_in_loop.ll b/polly/test/ScopInfo/cond_constant_in_loop.ll
index ef7d857e1084..552fddc6ff08 100644
--- a/polly/test/ScopInfo/cond_constant_in_loop.ll
+++ b/polly/test/ScopInfo/cond_constant_in_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;void f(long a[], long N, long M) {
; long i, j, k;
diff --git a/polly/test/ScopInfo/cond_in_loop.ll b/polly/test/ScopInfo/cond_in_loop.ll
index 2d435f6a6a93..c06dcd955bac 100644
--- a/polly/test/ScopInfo/cond_in_loop.ll
+++ b/polly/test/ScopInfo/cond_in_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;void f(long a[], long N, long M) {
; long i, j, k;
diff --git a/polly/test/ScopInfo/condition-after-error-block-2.ll b/polly/test/ScopInfo/condition-after-error-block-2.ll
index 695d864e483c..8c4b2170ad69 100644
--- a/polly/test/ScopInfo/condition-after-error-block-2.ll
+++ b/polly/test/ScopInfo/condition-after-error-block-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify that we do not allow PHI nodes such as %phi, if they reference an error
; block and are used by anything else than a terminator instruction.
diff --git a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll
index 184be3642f0c..d5069da916fa 100644
--- a/polly/test/ScopInfo/condition-after-error-block-before-scop.ll
+++ b/polly/test/ScopInfo/condition-after-error-block-before-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/condtion-after-error-block.ll b/polly/test/ScopInfo/condtion-after-error-block.ll
index 92e743e2d879..d9de4fc40a20 100644
--- a/polly/test/ScopInfo/condtion-after-error-block.ll
+++ b/polly/test/ScopInfo/condtion-after-error-block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify that we allow scops containing uniform branch conditions, where all
; but one incoming block comes from an error condition.
diff --git a/polly/test/ScopInfo/const_srem_sdiv.ll b/polly/test/ScopInfo/const_srem_sdiv.ll
index 3acca980da70..b4c2f119fe05 100644
--- a/polly/test/ScopInfo/const_srem_sdiv.ll
+++ b/polly/test/ScopInfo/const_srem_sdiv.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; See http://research.microsoft.com/pubs/151917/divmodnote-letter.pdf
;
diff --git a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll
index fc95a4cc7891..42c3b83d47f1 100644
--- a/polly/test/ScopInfo/constant-non-integer-branch-condition.ll
+++ b/polly/test/ScopInfo/constant-non-integer-branch-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; At some point this caused a problem in the domain generation as we
; assumed any constant branch condition to be valid. However, only constant
diff --git a/polly/test/ScopInfo/constant_factor_in_parameter.ll b/polly/test/ScopInfo/constant_factor_in_parameter.ll
index 1f0173c0edf9..b58d413e074e 100644
--- a/polly/test/ScopInfo/constant_factor_in_parameter.ll
+++ b/polly/test/ScopInfo/constant_factor_in_parameter.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -disable-output -polly-print-scops < %s | FileCheck %s
-; RUN: opt %loadPolly -disable-output -polly-print-function-scops < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -disable-output '-passes=print<polly-function-scops>' < %s 2>&1 | FileCheck %s
;
; Check that the constant part of the N * M * 4 expression is not part of the
; parameter but explicit in the access function. This can avoid existentially
diff --git a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll
index 38b2b8958e2f..62e6cd4641de 100644
--- a/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll
+++ b/polly/test/ScopInfo/constant_functions_outside_scop_as_unknown.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
diff --git a/polly/test/ScopInfo/constant_start_integer.ll b/polly/test/ScopInfo/constant_start_integer.ll
index aa6640c98f73..8991f8250f0b 100644
--- a/polly/test/ScopInfo/constant_start_integer.ll
+++ b/polly/test/ScopInfo/constant_start_integer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(float *input) {
diff --git a/polly/test/ScopInfo/debug_call.ll b/polly/test/ScopInfo/debug_call.ll
index 93b5bc520a00..a6761ecebe6a 100644
--- a/polly/test/ScopInfo/debug_call.ll
+++ b/polly/test/ScopInfo/debug_call.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-debug-func=dbg_printf -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-debug-func=dbg_printf '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Check that the call to dbg_printf is accepted as a debug-function.
;
diff --git a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll
index 108392b27f07..676c8a27e574 100644
--- a/polly/test/ScopInfo/delinearize-together-all-data-refs.ll
+++ b/polly/test/ScopInfo/delinearize-together-all-data-refs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void foo(long n, long m, long o, double A[n][m][o]) {
; for (long i = 0; i < n-3; i++)
diff --git a/polly/test/ScopInfo/div_by_zero.ll b/polly/test/ScopInfo/div_by_zero.ll
index 2205b85a9ebc..aecd16833b84 100644
--- a/polly/test/ScopInfo/div_by_zero.ll
+++ b/polly/test/ScopInfo/div_by_zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
index 997e0d4b37cf..baa423f40780 100644
--- a/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
+++ b/polly/test/ScopInfo/do-not-model-error-block-accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; Check that we do not crash on this input. Earlier this indeed crashed as
; we tried to model the access functions in an error block.
diff --git a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll
index e9ad63c51b85..a988b3f8c2b0 100644
--- a/polly/test/ScopInfo/eager-binary-and-or-conditions.ll
+++ b/polly/test/ScopInfo/eager-binary-and-or-conditions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output< %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
;
; void or(float *A, long n, long m) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/early_exit_for_complex_domains.ll b/polly/test/ScopInfo/early_exit_for_complex_domains.ll
index a72ea031c236..eed19b3214a7 100644
--- a/polly/test/ScopInfo/early_exit_for_complex_domains.ll
+++ b/polly/test/ScopInfo/early_exit_for_complex_domains.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; Check we do not crash.
;
diff --git a/polly/test/ScopInfo/error-blocks-1.ll b/polly/test/ScopInfo/error-blocks-1.ll
index 03353edf297a..047b095a9594 100644
--- a/polly/test/ScopInfo/error-blocks-1.ll
+++ b/polly/test/ScopInfo/error-blocks-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Context:
; CHECK-NEXT: [N] -> { : -2147483648 <= N <= 2147483647 }
diff --git a/polly/test/ScopInfo/error-blocks-2.ll b/polly/test/ScopInfo/error-blocks-2.ll
index 29095dacacfb..6fa12947540c 100644
--- a/polly/test/ScopInfo/error-blocks-2.ll
+++ b/polly/test/ScopInfo/error-blocks-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/escaping_empty_scop.ll b/polly/test/ScopInfo/escaping_empty_scop.ll
index 8837e19eefe4..2efaef3fb99b 100644
--- a/polly/test/ScopInfo/escaping_empty_scop.ll
+++ b/polly/test/ScopInfo/escaping_empty_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void g();
; int f(int *A) {
diff --git a/polly/test/ScopInfo/exit-phi-1.ll b/polly/test/ScopInfo/exit-phi-1.ll
index 8e6c5fb9e211..cbd6c280e8ca 100644
--- a/polly/test/ScopInfo/exit-phi-1.ll
+++ b/polly/test/ScopInfo/exit-phi-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-codegen -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; Check for correct code generation of exit PHIs, even if the same PHI value
; is used again inside the the SCoP.
diff --git a/polly/test/ScopInfo/exit-phi-2.ll b/polly/test/ScopInfo/exit-phi-2.ll
index d218d5fa039b..695c617b14c1 100644
--- a/polly/test/ScopInfo/exit-phi-2.ll
+++ b/polly/test/ScopInfo/exit-phi-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that there is no MK_ExitPHI READ access.
;
diff --git a/polly/test/ScopInfo/exit_phi_accesses-2.ll b/polly/test/ScopInfo/exit_phi_accesses-2.ll
index e376f0df9d54..b3b7cb1c6599 100644
--- a/polly/test/ScopInfo/exit_phi_accesses-2.ll
+++ b/polly/test/ScopInfo/exit_phi_accesses-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-LABEL: Function: foo
;
diff --git a/polly/test/ScopInfo/exit_phi_accesses.ll b/polly/test/ScopInfo/exit_phi_accesses.ll
index f4fbe31f6b24..77b038ec8e4a 100644
--- a/polly/test/ScopInfo/exit_phi_accesses.ll
+++ b/polly/test/ScopInfo/exit_phi_accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Check that PHI nodes only create PHI access and nothing else (e.g. unnecessary
; SCALAR accesses). In this case, for a PHI in the exit node, hence there is no
diff --git a/polly/test/ScopInfo/expensive-boundary-context.ll b/polly/test/ScopInfo/expensive-boundary-context.ll
index 7001b96acd21..1a8858d8fce2 100644
--- a/polly/test/ScopInfo/expensive-boundary-context.ll
+++ b/polly/test/ScopInfo/expensive-boundary-context.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output \
+; RUN: < %s 2>&1 | FileCheck %s
; CHECK-NOT: Assumed Context:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll
index 89ca344fdf54..5e833e7ae0f4 100644
--- a/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll
+++ b/polly/test/ScopInfo/extract_constant_factor_introduces_new_parameter.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; CHECK: Valid Region for Scop: bb10 => bb16
diff --git a/polly/test/ScopInfo/full-function.ll b/polly/test/ScopInfo/full-function.ll
index 670472576fe7..596c3d0af66a 100644
--- a/polly/test/ScopInfo/full-function.ll
+++ b/polly/test/ScopInfo/full-function.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output -polly-detect-full-functions < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-detect-full-functions < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=FULL
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=WITHOUT-FULL
; FULL: Region: %bb---FunctionExit
diff --git a/polly/test/ScopInfo/granularity_same_name.ll b/polly/test/ScopInfo/granularity_same_name.ll
index 1ebf5c6f71a2..17f75fbf8a97 100644
--- a/polly/test/ScopInfo/granularity_same_name.ll
+++ b/polly/test/ScopInfo/granularity_same_name.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines -check-prefix=IDX
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines -check-prefix=BB
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines -check-prefix=IDX
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines -check-prefix=BB
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=0 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=IDX
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-use-llvm-names=1 '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines -check-prefix=BB
;
; Check that the statement has the same name, regardless of how the
; basic block is split into multiple statements.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep.ll b/polly/test/ScopInfo/granularity_scalar-indep.ll
index fe509b468272..5c4484f9d457 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Split a block into two independent statements that share no scalar.
; This case has the instructions of the two statements interleaved, such that
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll
index 56bc11aed28d..7ae0d961b38f 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Two PHIs, cross-referencing each other. The PHI READs must be carried-out
; before the PHI WRITEs to ensure that the value when entering the block is
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll
index f46cf4e6a0a2..7839e51c163a 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_cross-referencing-phi2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Two PHIs, cross-referencing each other. The PHI READs must be carried-out
; before the PHI WRITEs to ensure that the value when entering the block is
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll
index e202e38f0844..8643e85e0559 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Split a block into two independent statements that share no scalar.
; This case has an independent statement just for PHI writes.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll
index 40af34bfb067..bc71cbe45cd9 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_epilogue_last.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; Check that the PHI Write of value that is defined in the same basic
; block is in the statement where it is defined.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll
index 9a0d207c0c2a..f3864bac519b 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_noepilogue.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; This case has no explicit epilogue for PHI writes because it would
; have a scalar dependency to the previous statement.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll
index d093806bc9cc..43101a8a0abf 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; This case should be split into two statements because {X[0], Y[0]}
; and {A[0], B[0]} do not intersect.
diff --git a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll
index b1d2936882aa..4974f7e9b28c 100644
--- a/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll
+++ b/polly/test/ScopInfo/granularity_scalar-indep_ordered.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
;
; This case cannot be split into two statements because the order of
; loads and store would be violated.
diff --git a/polly/test/ScopInfo/i1_params.ll b/polly/test/ScopInfo/i1_params.ll
index 1cb1329b08f9..be3e28737201 100644
--- a/polly/test/ScopInfo/i1_params.ll
+++ b/polly/test/ScopInfo/i1_params.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that both a signed as well as an unsigned extended i1 parameter
; is represented correctly.
diff --git a/polly/test/ScopInfo/infeasible-rtc.ll b/polly/test/ScopInfo/infeasible-rtc.ll
index ef96627e640e..7a0bfe0fa4d8 100644
--- a/polly/test/ScopInfo/infeasible-rtc.ll
+++ b/polly/test/ScopInfo/infeasible-rtc.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=SCOPS
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/infeasible_invalid_context.ll b/polly/test/ScopInfo/infeasible_invalid_context.ll
index 2c299f06c12e..006901ab05b7 100644
--- a/polly/test/ScopInfo/infeasible_invalid_context.ll
+++ b/polly/test/ScopInfo/infeasible_invalid_context.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=SCOPS
; DETECT: Valid Region for Scop: if.end116 => for.inc216
diff --git a/polly/test/ScopInfo/int2ptr_ptr2int.ll b/polly/test/ScopInfo/int2ptr_ptr2int.ll
index 9fadc5a8eb28..f6668ecdd089 100644
--- a/polly/test/ScopInfo/int2ptr_ptr2int.ll
+++ b/polly/test/ScopInfo/int2ptr_ptr2int.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-codegen < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; void f(long *A, long *ptr, long val) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll
index 97878f7091b1..361bf5a95761 100644
--- a/polly/test/ScopInfo/int2ptr_ptr2int_2.ll
+++ b/polly/test/ScopInfo/int2ptr_ptr2int_2.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-codegen \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -passes=polly-codegen \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; void f(long *A, long *B, long *ptr, long val) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/integers.ll b/polly/test/ScopInfo/integers.ll
index b608bf84cffa..4f6d1117e2bc 100644
--- a/polly/test/ScopInfo/integers.ll
+++ b/polly/test/ScopInfo/integers.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Check that we correctly convert integers to isl values.
diff --git a/polly/test/ScopInfo/inter-error-bb-dependence.ll b/polly/test/ScopInfo/inter-error-bb-dependence.ll
index 4e23de7e6a99..761fcbbe3435 100644
--- a/polly/test/ScopInfo/inter-error-bb-dependence.ll
+++ b/polly/test/ScopInfo/inter-error-bb-dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-print-scops -disable-output < %s 2>&1 > /dev/null | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 > /dev/null | FileCheck %s
;
; Error statements (%bb33) do not require their uses to be verified.
; In this case it uses %tmp32 from %bb31 which is not available because
diff --git a/polly/test/ScopInfo/inter_bb_scalar_dep.ll b/polly/test/ScopInfo/inter_bb_scalar_dep.ll
index 456f7a773f04..7313618b082b 100644
--- a/polly/test/ScopInfo/inter_bb_scalar_dep.ll
+++ b/polly/test/ScopInfo/inter_bb_scalar_dep.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll
index 859972b27402..d2ed3c17fe9d 100644
--- a/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll
+++ b/polly/test/ScopInfo/intra-non-affine-stmt-phi-node.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: < %s 2>&1 | FileCheck %s
; CHECK: Statements {
; CHECK-NEXT: Stmt_loop__TO__backedge
diff --git a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
index 37f4e0513ed3..b3286cd2a724 100644
--- a/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
+++ b/polly/test/ScopInfo/intra_and_inter_bb_scalar_dep.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopInfo/intra_bb_scalar_dep.ll b/polly/test/ScopInfo/intra_bb_scalar_dep.ll
index 0252273d3107..86855e7499a5 100644
--- a/polly/test/ScopInfo/intra_bb_scalar_dep.ll
+++ b/polly/test/ScopInfo/intra_bb_scalar_dep.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopInfo/intrinsics.ll b/polly/test/ScopInfo/intrinsics.ll
index 853429341381..c5bbacbe6d8c 100644
--- a/polly/test/ScopInfo/intrinsics.ll
+++ b/polly/test/ScopInfo/intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-print-instructions -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we remove the ignored intrinsics from the instruction list.
;
diff --git a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll
index 8d0de03e9866..723942668d8c 100644
--- a/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll
+++ b/polly/test/ScopInfo/invalid_add_rec_after_invariant_load_remapping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; This crashed at some point as we place %1 and %4 in the same equivalence class
; for invariant loads and when we remap SCEVs to use %4 instead of %1 AddRec SCEVs
diff --git a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll
index dcb0ad301ba3..c493c22af32d 100644
--- a/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll
+++ b/polly/test/ScopInfo/invalidate_iterator_during_MA_removal.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; Check that no invalidated iterator is accessed while elements from
; the list of MemoryAccesses are removed.
diff --git a/polly/test/ScopInfo/invariant-load-instlist.ll b/polly/test/ScopInfo/invariant-load-instlist.ll
index 7f4cf050f064..ecb80e4054c3 100644
--- a/polly/test/ScopInfo/invariant-load-instlist.ll
+++ b/polly/test/ScopInfo/invariant-load-instlist.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; The load is a required invariant load and at the same time used in a store.
; Polly used to add two MemoryAccesses for it which caused an assertion to fail.
diff --git a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll
index b97fe22e076e..89eac6ce69a1 100644
--- a/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll
+++ b/polly/test/ScopInfo/invariant-loads-leave-read-only-statements.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
; CHECK: Statements {
; CHECK-NEXT: Stmt_L_4
diff --git a/polly/test/ScopInfo/invariant_load.ll b/polly/test/ScopInfo/invariant_load.ll
index fcea77e19b85..9dc064276c40 100644
--- a/polly/test/ScopInfo/invariant_load.ll
+++ b/polly/test/ScopInfo/invariant_load.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll
index 100a8db2a9d1..40aa3098683b 100644
--- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll
+++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; struct {
; int a;
diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll
index e31deb6fd472..287676024079 100644
--- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll
+++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_escaping.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; struct {
; int a;
diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll
index bbf6d69a5fbb..cb745b4920b8 100644
--- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll
+++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; int U;
; void f(int *A) {
diff --git a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll
index 011c2fe3d549..fa5429d4803a 100644
--- a/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll
+++ b/polly/test/ScopInfo/invariant_load_access_classes_different_base_type_same_pointer_escaping.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; int U;
; int f(int *A) {
diff --git a/polly/test/ScopInfo/invariant_load_addrec_sum.ll b/polly/test/ScopInfo/invariant_load_addrec_sum.ll
index 09b158d342ed..2e639f7d5e33 100644
--- a/polly/test/ScopInfo/invariant_load_addrec_sum.ll
+++ b/polly/test/ScopInfo/invariant_load_addrec_sum.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Region: %entry.split---%if.end
; CHECK: Invariant Accesses: {
diff --git a/polly/test/ScopInfo/invariant_load_base_pointer.ll b/polly/test/ScopInfo/invariant_load_base_pointer.ll
index ddf11d892adb..f2539af97a0b 100644
--- a/polly/test/ScopInfo/invariant_load_base_pointer.ll
+++ b/polly/test/ScopInfo/invariant_load_base_pointer.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll
index 07f2c3768b0a..f854b1f48ea9 100644
--- a/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll
+++ b/polly/test/ScopInfo/invariant_load_base_pointer_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll
index d66d718d492a..5a9c5c6cabbe 100644
--- a/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll
+++ b/polly/test/ScopInfo/invariant_load_base_pointer_in_conditional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_branch_condition.ll b/polly/test/ScopInfo/invariant_load_branch_condition.ll
index 4f49d2969d86..d12750c30ba9 100644
--- a/polly/test/ScopInfo/invariant_load_branch_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_branch_condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting < %s 2>&1 | FileCheck %s
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll
index c6a7faf2e355..34d50a18663c 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll
index 921dd4fbde5c..51f3cf6c095a 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll
index c15d11ca865d..3a742bbccdf1 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll
index 0495a330792c..6bd8b3146e87 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll
index 9144fcf186c3..cb7e5646fc2b 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4b.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll
index aefacff6b46f..6f7fbacc089c 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_4c.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
index ecc0c0a23014..445832822bdf 100644
--- a/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
+++ b/polly/test/ScopInfo/invariant_load_canonicalize_array_baseptrs_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 \
; RUN: -polly-invariant-load-hoisting \
; RUN: | FileCheck %s
diff --git a/polly/test/ScopInfo/invariant_load_complex_condition.ll b/polly/test/ScopInfo/invariant_load_complex_condition.ll
index e721c222db5f..11e7088d68db 100644
--- a/polly/test/ScopInfo/invariant_load_complex_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_complex_condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -S -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -S '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/invariant_load_condition.ll b/polly/test/ScopInfo/invariant_load_condition.ll
index 84546984709e..c7d7b3c9ba61 100644
--- a/polly/test/ScopInfo/invariant_load_condition.ll
+++ b/polly/test/ScopInfo/invariant_load_condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_dereferenceable.ll b/polly/test/ScopInfo/invariant_load_dereferenceable.ll
index adba32d8d463..526bdc6ddb3b 100644
--- a/polly/test/ScopInfo/invariant_load_dereferenceable.ll
+++ b/polly/test/ScopInfo/invariant_load_dereferenceable.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-detect -polly-print-scops \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: Function: foo_undereferanceable
diff --git a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
index 60b4a1daa824..eb148063320e 100644
--- a/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
+++ b/polly/test/ScopInfo/invariant_load_distinct_parameter_valuations.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not consolidate the invariant loads to smp[order - 1] and
; smp[order - 2] in the blocks %0 and %16. While they have the same pointer
diff --git a/polly/test/ScopInfo/invariant_load_in_non_affine.ll b/polly/test/ScopInfo/invariant_load_in_non_affine.ll
index d00bc2d642e0..5261113f5a0c 100644
--- a/polly/test/ScopInfo/invariant_load_in_non_affine.ll
+++ b/polly/test/ScopInfo/invariant_load_in_non_affine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; CHECK-NOT: Valid Region for Scop
;
diff --git a/polly/test/ScopInfo/invariant_load_loop_ub.ll b/polly/test/ScopInfo/invariant_load_loop_ub.ll
index 856b6e4dd508..ee889e6c4d5a 100644
--- a/polly/test/ScopInfo/invariant_load_loop_ub.ll
+++ b/polly/test/ScopInfo/invariant_load_loop_ub.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-process-unprofitable -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
index 69463d420aca..6af7caecc0b3 100644
--- a/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
+++ b/polly/test/ScopInfo/invariant_load_ptr_ptr_noalias.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -tbaa -polly-print-scops -polly-invariant-load-hoisting=true -polly-ignore-aliasing \
-; RUN: -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=tbaa '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -polly-ignore-aliasing \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
; Note: The order of the invariant accesses is important because A is the
; base pointer of tmp3 and we will generate code in the same order as
diff --git a/polly/test/ScopInfo/invariant_load_scalar_dep.ll b/polly/test/ScopInfo/invariant_load_scalar_dep.ll
index 79a10426862a..319f24bdcb92 100644
--- a/polly/test/ScopInfo/invariant_load_scalar_dep.ll
+++ b/polly/test/ScopInfo/invariant_load_scalar_dep.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses:
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_load_stmt_domain.ll b/polly/test/ScopInfo/invariant_load_stmt_domain.ll
index 6cd71c85ea2f..715948062c05 100644
--- a/polly/test/ScopInfo/invariant_load_stmt_domain.ll
+++ b/polly/test/ScopInfo/invariant_load_stmt_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
; This test case verifies that the statement domain of the invariant access
; is the universe. In earlier versions of Polly, we accidentally computed an
diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
index e77515280241..a6108320d560 100644
--- a/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
+++ b/polly/test/ScopInfo/invariant_load_zext_parameter-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -scalar-evolution-max-value-compare-depth=3 -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -scalar-evolution-max-value-compare-depth=3 -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -scalar-evolution-max-value-compare-depth=3 -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
;
; Stress test for the code generation of invariant accesses.
;
diff --git a/polly/test/ScopInfo/invariant_load_zext_parameter.ll b/polly/test/ScopInfo/invariant_load_zext_parameter.ll
index 1bde70282d44..e3c183aab5e2 100644
--- a/polly/test/ScopInfo/invariant_load_zext_parameter.ll
+++ b/polly/test/ScopInfo/invariant_load_zext_parameter.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=CODEGEN
;
; void f(int *I0, int *I1, int *V) {
; for (int i = 0; i < 1000; i++) {
diff --git a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
index 775369e55c92..b5168e912ed7 100644
--- a/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
+++ b/polly/test/ScopInfo/invariant_load_zextended_in_own_execution_context.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -disable-output < %s
;
; CHECK: Execution Context: [p_0_loaded_from_currpc] -> { : }
;
diff --git a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
index 1d54ccc69023..85360821078d 100644
--- a/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
+++ b/polly/test/ScopInfo/invariant_loads_complicated_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
index e97de0c936bc..134eac22bff5 100644
--- a/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
+++ b/polly/test/ScopInfo/invariant_loads_cyclic_dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Negative test. If we assume UB[*V] to be invariant we get a cyclic
; dependence in the invariant loads that needs to be resolved by
diff --git a/polly/test/ScopInfo/invariant_loop_bounds.ll b/polly/test/ScopInfo/invariant_loop_bounds.ll
index 4e1fd88fac30..f22199cfe494 100644
--- a/polly/test/ScopInfo/invariant_loop_bounds.ll
+++ b/polly/test/ScopInfo/invariant_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
index 3d5737bbe168..a473ef30376c 100644
--- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
+++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we only have one parameter and one invariant load for all
; three loads that occur in the region but actually access the same
diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
index e2de503eb83f..66a0bc631b1d 100644
--- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
+++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we only have one parameter and one invariant load for all
; three loads that occur in the region but actually access the same
diff --git a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
index ca1b235be358..2df96faf7624 100644
--- a/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
+++ b/polly/test/ScopInfo/isl_aff_out_of_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-detect < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' < %s 2>&1
; Used to fail with:
; ../../isl/isl_aff.c:591: position out of bounds
diff --git a/polly/test/ScopInfo/isl_trip_count_01.ll b/polly/test/ScopInfo/isl_trip_count_01.ll
index fc6b79c5a68a..480b6e9574a6 100644
--- a/polly/test/ScopInfo/isl_trip_count_01.ll
+++ b/polly/test/ScopInfo/isl_trip_count_01.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: [M, N] -> { Stmt_while_body[i0] : i0 > 0 and 4i0 <= -M + N; Stmt_while_body[0] };
;
diff --git a/polly/test/ScopInfo/isl_trip_count_02.ll b/polly/test/ScopInfo/isl_trip_count_02.ll
index 9376cb415cec..b78fb838edd0 100644
--- a/polly/test/ScopInfo/isl_trip_count_02.ll
+++ b/polly/test/ScopInfo/isl_trip_count_02.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; TODO: We do not allow unbounded loops at the moment.
;
diff --git a/polly/test/ScopInfo/isl_trip_count_03.ll b/polly/test/ScopInfo/isl_trip_count_03.ll
index f5b0048a0e0e..96df05f89bcf 100644
--- a/polly/test/ScopInfo/isl_trip_count_03.ll
+++ b/polly/test/ScopInfo/isl_trip_count_03.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Test comes from a bug report (15771), or rather a feature request. It was not allowed
; in Polly in the old domain generation as ScalarEvolution cannot figure out the
diff --git a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
index 91bc19e2de44..fd310ececaa3 100644
--- a/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
+++ b/polly/test/ScopInfo/isl_trip_count_multiple_exiting_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/licm_reduction_nested.ll b/polly/test/ScopInfo/licm_reduction_nested.ll
index a3ba478cd9ff..c1676033fa90 100644
--- a/polly/test/ScopInfo/licm_reduction_nested.ll
+++ b/polly/test/ScopInfo/licm_reduction_nested.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -loop-rotate -indvars -polly-prepare -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -loop-rotate -indvars -licm -polly-prepare -polly-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -loop-rotate -indvars -licm -passes=polly-prepare '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; XFAIL: *
;
diff --git a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
index 1cbecf086968..f102518da526 100644
--- a/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
+++ b/polly/test/ScopInfo/long-compile-time-alias-analysis.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
; Verify that the compilation of this test case does not take infinite time.
; At some point Polly tried to model this test case and got stuck in
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
index c88ea1327389..6027975b563b 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
index 5b6ea9cc212d..4ef5ef09c44b 100644
--- a/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
+++ b/polly/test/ScopInfo/long-sequence-of-error-blocks.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
index 350db05c6dc0..431c907857fe 100644
--- a/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
+++ b/polly/test/ScopInfo/loop-multiexit-succ-cond.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/loop_affine_bound_0.ll b/polly/test/ScopInfo/loop_affine_bound_0.ll
index 33f49df7780f..918d4099740c 100644
--- a/polly/test/ScopInfo/loop_affine_bound_0.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_0.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long a[][128], long N, long M) {
; long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_1.ll b/polly/test/ScopInfo/loop_affine_bound_1.ll
index 38e47b74465b..8f7a87f1c5ac 100644
--- a/polly/test/ScopInfo/loop_affine_bound_1.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output< %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;void f(long a[][128], long N, long M) {
; long i, j;
diff --git a/polly/test/ScopInfo/loop_affine_bound_2.ll b/polly/test/ScopInfo/loop_affine_bound_2.ll
index e34662f4e6ab..2d9f997a0767 100644
--- a/polly/test/ScopInfo/loop_affine_bound_2.ll
+++ b/polly/test/ScopInfo/loop_affine_bound_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long a[][128], long N, long M) {
; long i, j;
diff --git a/polly/test/ScopInfo/loop_carry.ll b/polly/test/ScopInfo/loop_carry.ll
index f7c1dca0919c..20ebbfbc8b49 100644
--- a/polly/test/ScopInfo/loop_carry.ll
+++ b/polly/test/ScopInfo/loop_carry.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/many-scalar-dependences.ll b/polly/test/ScopInfo/many-scalar-dependences.ll
index aaa02f581a1c..5b003325ef0f 100644
--- a/polly/test/ScopInfo/many-scalar-dependences.ll
+++ b/polly/test/ScopInfo/many-scalar-dependences.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(float a[100][100]) {
; float x;
diff --git a/polly/test/ScopInfo/max-loop-depth.ll b/polly/test/ScopInfo/max-loop-depth.ll
index 3c7db4458604..71e9c02aa8dc 100644
--- a/polly/test/ScopInfo/max-loop-depth.ll
+++ b/polly/test/ScopInfo/max-loop-depth.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void bar();
; void foo(int *A, int *B, long int N, long int M) {
diff --git a/polly/test/ScopInfo/memcpy-raw-source.ll b/polly/test/ScopInfo/memcpy-raw-source.ll
index 137ab8229220..d9024cd27346 100644
--- a/polly/test/ScopInfo/memcpy-raw-source.ll
+++ b/polly/test/ScopInfo/memcpy-raw-source.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -scoped-noalias-aa -tbaa -polly-print-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa,scoped-noalias-aa,tbaa '-passes=print<polly-function-scops>' -disable-output < %s
;
; Ensure that ScopInfo's alias analysis of llvm.memcpy,
; like the AliasSetTracker, preserves bitcasts.
diff --git a/polly/test/ScopInfo/memcpy.ll b/polly/test/ScopInfo/memcpy.ll
index 705dea769e42..95c455f097b2 100644
--- a/polly/test/ScopInfo/memcpy.ll
+++ b/polly/test/ScopInfo/memcpy.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-differing-element-types -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -basic-aa -polly-allow-differing-element-types -polly-codegen < %s | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
;
; CHECK: Arrays {
; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memmove.ll b/polly/test/ScopInfo/memmove.ll
index 15123422f419..8ff471a11cd1 100644
--- a/polly/test/ScopInfo/memmove.ll
+++ b/polly/test/ScopInfo/memmove.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-differing-element-types -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -basic-aa -polly-allow-differing-element-types -polly-codegen < %s | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -aa-pipeline=basic-aa -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
;
; CHECK: Arrays {
; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset.ll b/polly/test/ScopInfo/memset.ll
index ef86b4c275e5..89b048772821 100644
--- a/polly/test/ScopInfo/memset.ll
+++ b/polly/test/ScopInfo/memset.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-differing-element-types -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -S -polly-allow-differing-element-types -polly-codegen < %s | FileCheck --check-prefix=IR %s
+; RUN: opt %loadNPMPolly -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -S -polly-allow-differing-element-types -passes=polly-codegen < %s 2>&1 | FileCheck --check-prefix=IR %s
;
; CHECK: Arrays {
; CHECK-NEXT: i8 MemRef_A[*]; // Element size 1
diff --git a/polly/test/ScopInfo/memset_null.ll b/polly/test/ScopInfo/memset_null.ll
index 1608ff6ebef4..9755cf1129e6 100644
--- a/polly/test/ScopInfo/memset_null.ll
+++ b/polly/test/ScopInfo/memset_null.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-modref-calls -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-modref-calls -S -polly-codegen < %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-modref-calls -S -passes=polly-codegen < %s
;
; Verify we can handle a memset to "null" and that we do not model it.
; TODO: FIXME: We could use the undefined memset to optimize the code further,
diff --git a/polly/test/ScopInfo/mismatching-array-dimensions.ll b/polly/test/ScopInfo/mismatching-array-dimensions.ll
index a1c6d4e82127..ed1e28cbee6e 100644
--- a/polly/test/ScopInfo/mismatching-array-dimensions.ll
+++ b/polly/test/ScopInfo/mismatching-array-dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK-NOT: AssumedContext
diff --git a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
index 72889324e37e..6bc5f8d8eb73 100644
--- a/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_access_pointee_arguments.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-codegen -polly-allow-modref-calls \
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb -passes=polly-codegen -polly-allow-modref-calls \
; RUN: -disable-output < %s
;
; Verify that we model the may-write access of the prefetch intrinsic
diff --git a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
index 2f6c6792fd9d..21322bc648f8 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointee_arguments.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-codegen -disable-output \
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
; RUN: -polly-allow-modref-calls < %s
;
; Verify that we model the read access of the gcread intrinsic
diff --git a/polly/test/ScopInfo/mod_ref_read_pointer.ll b/polly/test/ScopInfo/mod_ref_read_pointer.ll
index 657e37c68a7b..25e56a08a961 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointer.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-allow-modref-calls -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-allow-modref-calls -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-allow-modref-calls -passes=polly-codegen -disable-output < %s
;
; Check that we assume the call to func has a read on the whole A array.
;
diff --git a/polly/test/ScopInfo/mod_ref_read_pointers.ll b/polly/test/ScopInfo/mod_ref_read_pointers.ll
index 7ed3423a2aeb..5cc96cf3a06e 100644
--- a/polly/test/ScopInfo/mod_ref_read_pointers.ll
+++ b/polly/test/ScopInfo/mod_ref_read_pointers.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -polly-allow-modref-calls \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-codegen -disable-output \
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -passes=polly-codegen -disable-output \
; RUN: -polly-allow-modref-calls < %s
;
; Check that the call to func will "read" not only the A array but also the
diff --git a/polly/test/ScopInfo/modulo_zext_1.ll b/polly/test/ScopInfo/modulo_zext_1.ll
index d611ec4807b5..0a8957da4931 100644
--- a/polly/test/ScopInfo/modulo_zext_1.ll
+++ b/polly/test/ScopInfo/modulo_zext_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_2.ll b/polly/test/ScopInfo/modulo_zext_2.ll
index 8d2321849174..7af2411e7e8c 100644
--- a/polly/test/ScopInfo/modulo_zext_2.ll
+++ b/polly/test/ScopInfo/modulo_zext_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/modulo_zext_3.ll b/polly/test/ScopInfo/modulo_zext_3.ll
index acb26dc1c77f..1dac723aa2c2 100644
--- a/polly/test/ScopInfo/modulo_zext_3.ll
+++ b/polly/test/ScopInfo/modulo_zext_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/multi-scop.ll b/polly/test/ScopInfo/multi-scop.ll
index e26c8c7bae10..c6dc1f201efa 100644
--- a/polly/test/ScopInfo/multi-scop.ll
+++ b/polly/test/ScopInfo/multi-scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-detect -polly-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; This test case contains two scops.
diff --git a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
index 278c06a2fdba..bd46532d87f1 100644
--- a/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
+++ b/polly/test/ScopInfo/multidim_2d-diagonal-matrix.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
index 06a76466c25e..cdd46304c932 100644
--- a/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
+++ b/polly/test/ScopInfo/multidim_2d_outer_parametric_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
index bfbe5682d44a..0b735b910618 100644
--- a/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_2d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
index ba934adb675a..befca87972c1 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-nonaffine \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
; TODO: We should delinearize the accesses despite the use in a call to a
; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
index 3da123fd1f60..cceb5353d74c 100644
--- a/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
+++ b/polly/test/ScopInfo/multidim_2d_with_modref_call_2.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-nonaffine \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
; TODO: We should delinearize the accesses despite the use in a call to a
; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
index 988475575fec..c957dd10ed65 100644
--- a/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
+++ b/polly/test/ScopInfo/multidim_3d_parametric_array_static_loop_bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
index ddc35a46a633..4a1ee3b1af51 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_different_dimensionality.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; #define N 400
;
diff --git a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
index 9c749f0c48c8..9a6d8fbe1275 100644
--- a/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
+++ b/polly/test/ScopInfo/multidim_fixedsize_multi_offset.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Context:
; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim.ll b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
index e95d400a860c..9f4769402286 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; struct com {
; double Real;
diff --git a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
index 57275e4024ab..5778126ad8f1 100644
--- a/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
+++ b/polly/test/ScopInfo/multidim_fold_constant_dim_zero.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -debug -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -debug -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScopInfo/multidim_fortran_2d.ll b/polly/test/ScopInfo/multidim_fortran_2d.ll
index 29279a4e886b..e5b005f17dcc 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops \
-; RUN: -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' \
+; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
; subroutine init_array(ni, nj, pi, pj, a)
; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_params.ll b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
index 93145b399ca5..a7f7ebc13036 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_params.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_params.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
; RUN: -polly-precise-fold-accesses \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
; subroutine init_array(ni, nj, pi, pj, a)
; implicit none
diff --git a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
index dff6a8be85cf..5f3080a12fdb 100644
--- a/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
+++ b/polly/test/ScopInfo/multidim_fortran_2d_with_modref_call.ll
@@ -1,9 +1,9 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-modref-calls \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-modref-calls \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-allow-nonaffine \
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-allow-nonaffine \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -polly-allow-modref-calls -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
+; RUN: -polly-allow-modref-calls -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
; TODO: We should delinearize the accesses despite the use in a call to a
; readonly function. For now we verify we do not delinearize them though.
diff --git a/polly/test/ScopInfo/multidim_fortran_srem.ll b/polly/test/ScopInfo/multidim_fortran_srem.ll
index 8c24c5b8ee71..31cc633fa65c 100644
--- a/polly/test/ScopInfo/multidim_fortran_srem.ll
+++ b/polly/test/ScopInfo/multidim_fortran_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
; CHECK: Statements {
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast.ll b/polly/test/ScopInfo/multidim_gep_pointercast.ll
index 20d59fa91eaf..fd8048b11f14 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The load access to A has a pointer-bitcast to another element size before the
; GetElementPtr. Verify that we do not use GEP delinearization because it
diff --git a/polly/test/ScopInfo/multidim_gep_pointercast2.ll b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
index deed9c7c3f57..b31a0d0262db 100644
--- a/polly/test/ScopInfo/multidim_gep_pointercast2.ll
+++ b/polly/test/ScopInfo/multidim_gep_pointercast2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that we do not use the GetElementPtr information to delinearize A
; because of the cast in-between. Use the single-dimensional modeling instead.
diff --git a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
index 9f7e6bc4a2a2..92b42a9e7a87 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_integer_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
index 131bb7b3ebed..261cba1e68aa 100644
--- a/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
+++ b/polly/test/ScopInfo/multidim_ivs_and_parameteric_offsets_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-precise-fold-accesses -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o], long p, long q, long r) {
diff --git a/polly/test/ScopInfo/multidim_many_references.ll b/polly/test/ScopInfo/multidim_many_references.ll
index b0483b267260..3801fda4923c 100644
--- a/polly/test/ScopInfo/multidim_many_references.ll
+++ b/polly/test/ScopInfo/multidim_many_references.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-ignore-aliasing -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -polly-ignore-aliasing -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-ignore-aliasing -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/multidim_nested_start_integer.ll b/polly/test/ScopInfo/multidim_nested_start_integer.ll
index 741a0ef45c27..6ee9798a050d 100644
--- a/polly/test/ScopInfo/multidim_nested_start_integer.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_integer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
index 692746bad3d7..e238bddf4783 100644
--- a/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
+++ b/polly/test/ScopInfo/multidim_nested_start_share_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_2d.ll b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
index 71245642e751..33b321716edc 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_2d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_2d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Derived from the following code:
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d.ll b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
index a019d58b241d..39ea4243d942 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long n, long m, long o, double A[n][m][o]) {
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
index 41577ef1a0be..7f7f7f91067e 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_cast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void foo(int n, int m, int o, double A[n][m][o]) {
;
diff --git a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
index 25907f2ee79c..1675110ffd6f 100644
--- a/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
+++ b/polly/test/ScopInfo/multidim_only_ivs_3d_reverse.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; This test case checks for array access functions where the order in which the
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
index 0790664f7129..da9827fd5f2c 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-precise-fold-accesses -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-precise-fold-accesses '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(long n, long m, float A[][n][m]) {
; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_param_in_subscript.ll b/polly/test/ScopInfo/multidim_param_in_subscript.ll
index b8ec80b321fe..c86b5f0ae238 100644
--- a/polly/test/ScopInfo/multidim_param_in_subscript.ll
+++ b/polly/test/ScopInfo/multidim_param_in_subscript.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
;
; void foo(long n, float A[][n]) {
diff --git a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
index 7db3e9dc3b5f..da563a05560c 100644
--- a/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
+++ b/polly/test/ScopInfo/multidim_parameter_addrec_product.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(float *A, long *p) {
; for (long i = 0; i < 100; i++)
diff --git a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
index 1e302dec4861..7059e5396987 100644
--- a/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
+++ b/polly/test/ScopInfo/multidim_single_and_multidim_array.ll
@@ -1,11 +1,11 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-delinearize=false -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadPolly -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadPolly -polly-print-function-scops -polly-delinearize=false -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=NONAFFINE
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s --check-prefix=DELIN
-; RUN: opt %loadPolly -polly-print-function-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s --check-prefix=DELIN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-delinearize=false -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s --check-prefix=DELIN
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/multidim_srem.ll b/polly/test/ScopInfo/multidim_srem.ll
index f89843f0a5bc..c965e2c86e2b 100644
--- a/polly/test/ScopInfo/multidim_srem.ll
+++ b/polly/test/ScopInfo/multidim_srem.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(long n, float A[][n][n]) {
; for (long i = 0; i < 200; i++)
diff --git a/polly/test/ScopInfo/multidim_with_bitcast.ll b/polly/test/ScopInfo/multidim_with_bitcast.ll
index b77ff689b953..0ab9c2d93ff4 100644
--- a/polly/test/ScopInfo/multidim_with_bitcast.ll
+++ b/polly/test/ScopInfo/multidim_with_bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/multiple-binary-or-conditions.ll b/polly/test/ScopInfo/multiple-binary-or-conditions.ll
index b905a11f577c..65416e6fffda 100644
--- a/polly/test/ScopInfo/multiple-binary-or-conditions.ll
+++ b/polly/test/ScopInfo/multiple-binary-or-conditions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -disable-output < %s
;
; void or(float *A, long n, long m) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll
index 2d03ad941c05..910e624adb50 100644
--- a/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll
+++ b/polly/test/ScopInfo/multiple-types-access-offset-not-dividable-by-element-size.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -pass-remarks-analysis="polly-scops" \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
; RUN: -polly-allow-differing-element-types \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/multiple-types-non-affine-2.ll b/polly/test/ScopInfo/multiple-types-non-affine-2.ll
index 5b0aa5de1e71..cb0630da1b2e 100644
--- a/polly/test/ScopInfo/multiple-types-non-affine-2.ll
+++ b/polly/test/ScopInfo/multiple-types-non-affine-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -polly-codegen -polly-allow-nonaffine -disable-output
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output
;
; // Check that accessing one array with different types works,
; // even though some accesses are non-affine.
diff --git a/polly/test/ScopInfo/multiple-types-non-affine.ll b/polly/test/ScopInfo/multiple-types-non-affine.ll
index 8e4be4c86d5a..7349c5ae48ba 100644
--- a/polly/test/ScopInfo/multiple-types-non-affine.ll
+++ b/polly/test/ScopInfo/multiple-types-non-affine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -polly-codegen -polly-allow-nonaffine -disable-output
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types '-passes=print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-differing-element-types -passes=polly-codegen -polly-allow-nonaffine -disable-output
;
; // Check that accessing one array with different types works,
; // even though some accesses are non-affine.
diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll
index 01f5923457b4..df280c88f866 100644
--- a/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll
+++ b/polly/test/ScopInfo/multiple-types-non-power-of-two-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-allow-differing-element-types -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s
;
; void multiple_types(i8 *A) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll
index 142a5ac395b3..b9494187d0ff 100644
--- a/polly/test/ScopInfo/multiple-types-non-power-of-two.ll
+++ b/polly/test/ScopInfo/multiple-types-non-power-of-two.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-allow-differing-element-types -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s
;
; void multiple_types(i8 *A) {
; for (long i = 0; i < 100; i++) {
diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll
index 1e2e53e85c25..e971ccc0ba44 100644
--- a/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll
+++ b/polly/test/ScopInfo/multiple-types-two-dimensional-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -pass-remarks-analysis="polly-scops" \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
; RUN: -polly-allow-differing-element-types \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/multiple-types-two-dimensional.ll b/polly/test/ScopInfo/multiple-types-two-dimensional.ll
index 21dc96e6f95d..34179508cae8 100644
--- a/polly/test/ScopInfo/multiple-types-two-dimensional.ll
+++ b/polly/test/ScopInfo/multiple-types-two-dimensional.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -pass-remarks-analysis="polly-scops" \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
; RUN: -polly-allow-differing-element-types \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
diff --git a/polly/test/ScopInfo/multiple-types.ll b/polly/test/ScopInfo/multiple-types.ll
index 16db191c522f..84d7d3349e29 100644
--- a/polly/test/ScopInfo/multiple-types.ll
+++ b/polly/test/ScopInfo/multiple-types.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops \
-; RUN: -polly-allow-differing-element-types -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \
+; RUN: -polly-allow-differing-element-types -disable-output < %s 2>&1 | FileCheck %s
;
; // Check that accessing one array with different types works.
; void multiple_types(char *Short, char *Float, char *Double) {
diff --git a/polly/test/ScopInfo/multiple_exiting_blocks.ll b/polly/test/ScopInfo/multiple_exiting_blocks.ll
index f8e5d4106a16..b0c425ee62cc 100644
--- a/polly/test/ScopInfo/multiple_exiting_blocks.ll
+++ b/polly/test/ScopInfo/multiple_exiting_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll
index c695f3c913db..ff0ec47be1c5 100644
--- a/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll
+++ b/polly/test/ScopInfo/multiple_exiting_blocks_two_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/multiple_latch_blocks.ll b/polly/test/ScopInfo/multiple_latch_blocks.ll
index d3949e7e2c3c..e5085daa2ca1 100644
--- a/polly/test/ScopInfo/multiple_latch_blocks.ll
+++ b/polly/test/ScopInfo/multiple_latch_blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Domain :=
; CHECK: [N, P] -> { Stmt_if_end[i0] : 0 <= i0 < N and (i0 > P or i0 < P) };
diff --git a/polly/test/ScopInfo/nested-loops.ll b/polly/test/ScopInfo/nested-loops.ll
index ed814f826829..91002979f4fa 100644
--- a/polly/test/ScopInfo/nested-loops.ll
+++ b/polly/test/ScopInfo/nested-loops.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll
index 7c55e242641c..df010846bed2 100644
--- a/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll
+++ b/polly/test/ScopInfo/no-scalar-deps-in-non-affine-subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not generate any scalar dependences regarding x. It is
; defined and used on the non-affine subregion only, thus we do not need
diff --git a/polly/test/ScopInfo/non-affine-region-phi.ll b/polly/test/ScopInfo/non-affine-region-phi.ll
index f99782b9a0ff..3fb655e60f1c 100644
--- a/polly/test/ScopInfo/non-affine-region-phi.ll
+++ b/polly/test/ScopInfo/non-affine-region-phi.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -S < %s | FileCheck %s --check-prefix=CODE
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -S < %s 2>&1 | FileCheck %s --check-prefix=CODE
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify there is a phi in the non-affine region but it is not represented in
; the SCoP as all operands as well as the uses are inside the region too.
diff --git a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll
index b673fda5ec3c..4c3ca4d21447 100644
--- a/polly/test/ScopInfo/non-affine-region-with-loop-2.ll
+++ b/polly/test/ScopInfo/non-affine-region-with-loop-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops -polly-print-scops -polly-codegen -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-allow-nonaffine-loops '-passes=print<polly-detect>,print<polly-function-scops>,scop(polly-codegen)' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Stmt_loop3
; CHECK: Domain :=
diff --git a/polly/test/ScopInfo/non-affine-region-with-loop.ll b/polly/test/ScopInfo/non-affine-region-with-loop.ll
index 32dde8b4a682..f4c028ac2340 100644
--- a/polly/test/ScopInfo/non-affine-region-with-loop.ll
+++ b/polly/test/ScopInfo/non-affine-region-with-loop.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-codegen -disable-output
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops -passes=polly-codegen -disable-output
;
; CHECK: Domain :=
; CHECK-NEXT: { Stmt_loop2__TO__loop[] };
diff --git a/polly/test/ScopInfo/non-precise-inv-load-1.ll b/polly/test/ScopInfo/non-precise-inv-load-1.ll
index 5394206dd547..d55344b355f1 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-1.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we do hoist the invariant access to I with a execution context
; as the address computation might wrap in the original but not in our
diff --git a/polly/test/ScopInfo/non-precise-inv-load-2.ll b/polly/test/ScopInfo/non-precise-inv-load-2.ll
index 5c0c56513a08..79ef3b88cb4f 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-2.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
;
; CHECK: Invariant Accesses: {
diff --git a/polly/test/ScopInfo/non-precise-inv-load-3.ll b/polly/test/ScopInfo/non-precise-inv-load-3.ll
index 09d09319656b..aa9284766116 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-3.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/non-precise-inv-load-4.ll b/polly/test/ScopInfo/non-precise-inv-load-4.ll
index da5f656576d1..2a2241cb5a99 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-4.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we hoist I[0] without execution context even though it
; is executed in a statement with an invalid domain.
diff --git a/polly/test/ScopInfo/non-precise-inv-load-5.ll b/polly/test/ScopInfo/non-precise-inv-load-5.ll
index bff5f59a3302..a414c7c0fed1 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-5.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we do not hoist I[c] without execution context because it
; is executed in a statement with an invalid domain and it depends
diff --git a/polly/test/ScopInfo/non-precise-inv-load-6.ll b/polly/test/ScopInfo/non-precise-inv-load-6.ll
index 03540a8ead96..1300617f00ee 100644
--- a/polly/test/ScopInfo/non-precise-inv-load-6.ll
+++ b/polly/test/ScopInfo/non-precise-inv-load-6.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we model the execution context correctly.
;
diff --git a/polly/test/ScopInfo/non-pure-function-call.ll b/polly/test/ScopInfo/non-pure-function-call.ll
index 4ffb8d28865d..81d43db5c352 100644
--- a/polly/test/ScopInfo/non-pure-function-call.ll
+++ b/polly/test/ScopInfo/non-pure-function-call.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll
index 27998b50b74f..6cbb41041be8 100644
--- a/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll
+++ b/polly/test/ScopInfo/non-pure-function-calls-causes-dead-blocks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Error blocks are skipped during SCoP detection. We skip them during
; SCoP formation too as they might contain instructions we can not handle.
diff --git a/polly/test/ScopInfo/non-pure-function-calls.ll b/polly/test/ScopInfo/non-pure-function-calls.ll
index 3ecf75853773..f97644052272 100644
--- a/polly/test/ScopInfo/non-pure-function-calls.ll
+++ b/polly/test/ScopInfo/non-pure-function-calls.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Allow the user to define function names that are treated as
; error functions and assumed not to be executed.
diff --git a/polly/test/ScopInfo/non_affine_access.ll b/polly/test/ScopInfo/non_affine_access.ll
index a83c9484ad52..0338edf05329 100644
--- a/polly/test/ScopInfo/non_affine_access.ll
+++ b/polly/test/ScopInfo/non_affine_access.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -polly-allow-nonaffine -disable-output < %s | FileCheck %s -check-prefix=NONAFFINE
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -polly-allow-nonaffine -disable-output < %s 2>&1 | FileCheck %s -check-prefix=NONAFFINE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; void foo(long *A) {
diff --git a/polly/test/ScopInfo/non_affine_region_1.ll b/polly/test/ScopInfo/non_affine_region_1.ll
index 7c4312599cf0..8980a711b325 100644
--- a/polly/test/ScopInfo/non_affine_region_1.ll
+++ b/polly/test/ScopInfo/non_affine_region_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify only the incoming scalar x is modeled as a read in the non-affine
; region.
diff --git a/polly/test/ScopInfo/non_affine_region_2.ll b/polly/test/ScopInfo/non_affine_region_2.ll
index 0bc467c92bcb..b2e072f7a3bf 100644
--- a/polly/test/ScopInfo/non_affine_region_2.ll
+++ b/polly/test/ScopInfo/non_affine_region_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify the scalar x defined in a non-affine subregion is written as it
; escapes the region. In this test the two conditionals inside the region
diff --git a/polly/test/ScopInfo/non_affine_region_3.ll b/polly/test/ScopInfo/non_affine_region_3.ll
index 6d5f94df6110..d850cb5c95aa 100644
--- a/polly/test/ScopInfo/non_affine_region_3.ll
+++ b/polly/test/ScopInfo/non_affine_region_3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify the scalar x defined in a non-affine subregion is written as it
; escapes the region. In this test the two conditionals inside the region
diff --git a/polly/test/ScopInfo/non_affine_region_4.ll b/polly/test/ScopInfo/non_affine_region_4.ll
index f37e0ecb89d1..c5309734a668 100644
--- a/polly/test/ScopInfo/non_affine_region_4.ll
+++ b/polly/test/ScopInfo/non_affine_region_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify that both scalars (x and y) are properly written in the non-affine
; region and read afterwards.
diff --git a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll
index 445dd164898b..b1ce00f0df94 100644
--- a/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll
+++ b/polly/test/ScopInfo/nonaffine-buildMemoryAccess.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine-loops -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine-loops '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Domain :=
; CHECK-NEXT: { Stmt_while_cond_i__TO__while_end_i[] };
diff --git a/polly/test/ScopInfo/not-a-reduction.ll b/polly/test/ScopInfo/not-a-reduction.ll
index 87909290fd71..3a961b2dc171 100644
--- a/polly/test/ScopInfo/not-a-reduction.ll
+++ b/polly/test/ScopInfo/not-a-reduction.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s 2>&1 | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s
;#define TYPE float
;#define NUM 4
diff --git a/polly/test/ScopInfo/opaque-struct.ll b/polly/test/ScopInfo/opaque-struct.ll
index 19fdd9bf9179..f4f79525069e 100644
--- a/polly/test/ScopInfo/opaque-struct.ll
+++ b/polly/test/ScopInfo/opaque-struct.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s
;
; Check that we do not crash with unsized (opaque) types.
;
diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll
index 394173bdc986..eed27b1c4d9d 100644
--- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll
+++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node-nonaffine-subregion.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-codegen -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S < %s 2>&1 | FileCheck %s
;
; Check whether %newval is identified as escaping value, even though it is used
; in a phi that is in the region. Non-affine subregion case.
diff --git a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll
index e17164e89372..44da399e704d 100644
--- a/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll
+++ b/polly/test/ScopInfo/out-of-scop-use-in-region-entry-phi-node.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: MustWriteAccess := [Reduction Type: NONE] [Scalar: 1]
; CHECK-NEXT: [p_0] -> { Stmt_bb3[] -> MemRef_tmp5[] };
diff --git a/polly/test/ScopInfo/parameter-constant-division.ll b/polly/test/ScopInfo/parameter-constant-division.ll
index cd6b9e3526aa..e5dd359158b8 100644
--- a/polly/test/ScopInfo/parameter-constant-division.ll
+++ b/polly/test/ScopInfo/parameter-constant-division.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops \
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true \
-; RUN: -disable-output < %s | FileCheck %s
+; RUN: -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/parameter_in_dead_statement.ll b/polly/test/ScopInfo/parameter_in_dead_statement.ll
index 4b4a87f098d7..b295f17f628a 100644
--- a/polly/test/ScopInfo/parameter_in_dead_statement.ll
+++ b/polly/test/ScopInfo/parameter_in_dead_statement.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -S \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -S \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; Verify we do not create assumptions based on the parameter p_1 which is the
; load %0 and due to error-assumptions not "part of the SCoP".
diff --git a/polly/test/ScopInfo/parameter_product.ll b/polly/test/ScopInfo/parameter_product.ll
index 1ba7280f97c9..2fe16f9d95f6 100644
--- a/polly/test/ScopInfo/parameter_product.ll
+++ b/polly/test/ScopInfo/parameter_product.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; int n, m;
; void foo(char* __restrict a)
diff --git a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll
index 72d580801573..6544aaec76f7 100644
--- a/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll
+++ b/polly/test/ScopInfo/parameter_with_constant_factor_in_add.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the access function of the store is simple and concise
;
diff --git a/polly/test/ScopInfo/partially_invariant_load_1.ll b/polly/test/ScopInfo/partially_invariant_load_1.ll
index 274a7873c782..f3923f6127cd 100644
--- a/polly/test/ScopInfo/partially_invariant_load_1.ll
+++ b/polly/test/ScopInfo/partially_invariant_load_1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-codegen -polly-invariant-load-hoisting=true -S < %s | FileCheck %s --check-prefix=IR
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -passes=polly-codegen -polly-invariant-load-hoisting=true -S < %s 2>&1 | FileCheck %s --check-prefix=IR
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/partially_invariant_load_2.ll b/polly/test/ScopInfo/partially_invariant_load_2.ll
index ee1092883f72..d0d74ad99e09 100644
--- a/polly/test/ScopInfo/partially_invariant_load_2.ll
+++ b/polly/test/ScopInfo/partially_invariant_load_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-invariant-load-hoisting=true -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not try to preload *I and assume p != 42.
;
diff --git a/polly/test/ScopInfo/phi-in-non-affine-region.ll b/polly/test/ScopInfo/phi-in-non-affine-region.ll
index 6ef24e3f1456..fbbc158b566b 100644
--- a/polly/test/ScopInfo/phi-in-non-affine-region.ll
+++ b/polly/test/ScopInfo/phi-in-non-affine-region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify that 'tmp' is stored in bb1 and read by bb3, as it is needed as
; incoming value for the tmp11 PHI node.
diff --git a/polly/test/ScopInfo/phi_after_error_block.ll b/polly/test/ScopInfo/phi_after_error_block.ll
index 039fb86bec5b..a1eadff3e971 100644
--- a/polly/test/ScopInfo/phi_after_error_block.ll
+++ b/polly/test/ScopInfo/phi_after_error_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
declare void @bar()
diff --git a/polly/test/ScopInfo/phi_condition_modeling_1.ll b/polly/test/ScopInfo/phi_condition_modeling_1.ll
index a879c2005ad8..a889ec96a4b1 100644
--- a/polly/test/ScopInfo/phi_condition_modeling_1.ll
+++ b/polly/test/ScopInfo/phi_condition_modeling_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int c, int N) {
; int tmp;
diff --git a/polly/test/ScopInfo/phi_condition_modeling_2.ll b/polly/test/ScopInfo/phi_condition_modeling_2.ll
index cedc140f8438..b56b77e1f453 100644
--- a/polly/test/ScopInfo/phi_condition_modeling_2.ll
+++ b/polly/test/ScopInfo/phi_condition_modeling_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int c, int N) {
; int tmp;
diff --git a/polly/test/ScopInfo/phi_conditional_simple_1.ll b/polly/test/ScopInfo/phi_conditional_simple_1.ll
index 90213a953767..14fdc38201bc 100644
--- a/polly/test/ScopInfo/phi_conditional_simple_1.ll
+++ b/polly/test/ScopInfo/phi_conditional_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void jd(int *A, int c) {
; for (int i = 0; i < 1024; i++) {
diff --git a/polly/test/ScopInfo/phi_loop_carried_float.ll b/polly/test/ScopInfo/phi_loop_carried_float.ll
index d8d2608329bc..76e5507f24b0 100644
--- a/polly/test/ScopInfo/phi_loop_carried_float.ll
+++ b/polly/test/ScopInfo/phi_loop_carried_float.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; float f(float *A, int N) {
; float tmp = 0;
diff --git a/polly/test/ScopInfo/phi_not_grouped_at_top.ll b/polly/test/ScopInfo/phi_not_grouped_at_top.ll
index be082165b635..c97d9a27b24b 100644
--- a/polly/test/ScopInfo/phi_not_grouped_at_top.ll
+++ b/polly/test/ScopInfo/phi_not_grouped_at_top.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-prepare -disable-output < %s
+; RUN: opt %loadNPMPolly -passes=polly-prepare -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
declare i32 @funa() align 2
diff --git a/polly/test/ScopInfo/phi_scalar_simple_1.ll b/polly/test/ScopInfo/phi_scalar_simple_1.ll
index d042613c023f..ffd1a37f8a79 100644
--- a/polly/test/ScopInfo/phi_scalar_simple_1.ll
+++ b/polly/test/ScopInfo/phi_scalar_simple_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The assumed context should be empty since the <nsw> flags on the IV
; increments already guarantee that there is no wrap in the loop trip
diff --git a/polly/test/ScopInfo/phi_scalar_simple_2.ll b/polly/test/ScopInfo/phi_scalar_simple_2.ll
index fb4292e05ca6..0d6d9029c61c 100644
--- a/polly/test/ScopInfo/phi_scalar_simple_2.ll
+++ b/polly/test/ScopInfo/phi_scalar_simple_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; int jd(int *restrict A, int x, int N, int c) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/phi_with_invoke_edge.ll b/polly/test/ScopInfo/phi_with_invoke_edge.ll
index dbcf04c0561a..9c98ec0c603c 100644
--- a/polly/test/ScopInfo/phi_with_invoke_edge.ll
+++ b/polly/test/ScopInfo/phi_with_invoke_edge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-detect -disable-output < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
declare i32 @generic_personality_v0(i32, i64, ptr, ptr)
diff --git a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll
index 094c5ccab54d..18ba18c69f1f 100644
--- a/polly/test/ScopInfo/pointer-comparison-no-nsw.ll
+++ b/polly/test/ScopInfo/pointer-comparison-no-nsw.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int *B) {
; while (A != B) {
diff --git a/polly/test/ScopInfo/pointer-comparison.ll b/polly/test/ScopInfo/pointer-comparison.ll
index 15ce0491209a..846640ac630f 100644
--- a/polly/test/ScopInfo/pointer-comparison.ll
+++ b/polly/test/ScopInfo/pointer-comparison.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; TODO: FIXME: Investigate why we need a InvalidContext here.
;
diff --git a/polly/test/ScopInfo/pointer-type-expressions.ll b/polly/test/ScopInfo/pointer-type-expressions.ll
index ebbb644340f6..89dce6536a10 100644
--- a/polly/test/ScopInfo/pointer-type-expressions.ll
+++ b/polly/test/ScopInfo/pointer-type-expressions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N, float *P) {
; int i;
diff --git a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll
index 3ac86a3443af..7b6d0d542581 100644
--- a/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll
+++ b/polly/test/ScopInfo/pointer-used-as-base-pointer-and-scalar-read.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; In this test case we pass a pointer %A into a PHI node and also use this
; pointer as base pointer of an array store. As a result, we get both scalar
diff --git a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll
index 8152010c2c99..13087a517501 100644
--- a/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll
+++ b/polly/test/ScopInfo/polly-timeout-parameter-bounds.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Statements {
; CHECK-NEXT: Stmt_bb9
diff --git a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll
index 4a68acd3d509..33fa0126aa30 100644
--- a/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll
+++ b/polly/test/ScopInfo/preserve-equiv-class-order-in-basic_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=scalar-indep -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/polly/test/ScopInfo/process_added_dimensions.ll b/polly/test/ScopInfo/process_added_dimensions.ll
index 6cb270a071f4..2d06f4b99597 100644
--- a/polly/test/ScopInfo/process_added_dimensions.ll
+++ b/polly/test/ScopInfo/process_added_dimensions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Context:
; CHECK-NEXT: { : }
diff --git a/polly/test/ScopInfo/pwaff-complexity-bailout.ll b/polly/test/ScopInfo/pwaff-complexity-bailout.ll
index 19dd156d27db..931e08fb8f2f 100644
--- a/polly/test/ScopInfo/pwaff-complexity-bailout.ll
+++ b/polly/test/ScopInfo/pwaff-complexity-bailout.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -pass-remarks-analysis=.* -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis=.* -disable-output < %s 2>&1 | FileCheck %s
; Make sure we hit the complexity bailout, and don't crash.
; CHECK: Low complexity assumption: { : false }
diff --git a/polly/test/ScopInfo/ranged_parameter.ll b/polly/test/ScopInfo/ranged_parameter.ll
index 4b04960ee845..03562b1fd124 100644
--- a/polly/test/ScopInfo/ranged_parameter.ll
+++ b/polly/test/ScopInfo/ranged_parameter.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the constraints on the parameter derived from the
; range metadata (see bottom of the file) are present:
diff --git a/polly/test/ScopInfo/ranged_parameter_2.ll b/polly/test/ScopInfo/ranged_parameter_2.ll
index cd7d2bfb84d0..18cbbf3b87cd 100644
--- a/polly/test/ScopInfo/ranged_parameter_2.ll
+++ b/polly/test/ScopInfo/ranged_parameter_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-allow-nonaffine -polly-invariant-load-hoisting=true < %s \
; RUN: -debug 2>&1 | FileCheck %s
; REQUIRES: asserts
diff --git a/polly/test/ScopInfo/ranged_parameter_wrap.ll b/polly/test/ScopInfo/ranged_parameter_wrap.ll
index 173746352cf0..d236eeeefc11 100644
--- a/polly/test/ScopInfo/ranged_parameter_wrap.ll
+++ b/polly/test/ScopInfo/ranged_parameter_wrap.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the constraints on the parameter derived from the
; __wrapping__ range metadata (see bottom of the file) are present:
diff --git a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll
index 33f57f37a1e8..fc0a737a5edb 100644
--- a/polly/test/ScopInfo/ranged_parameter_wrap_2.ll
+++ b/polly/test/ScopInfo/ranged_parameter_wrap_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that the context is built fast and does not explode due to us
; combining a large number of non-convex ranges. Instead, after a certain
diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll
index 23c7aa261ac0..7e6f2406a0ac 100644
--- a/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll
+++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; float foo(float sum, float A[]) {
;
diff --git a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll
index 20f44c94251c..18e6c1fac9e1 100644
--- a/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll
+++ b/polly/test/ScopInfo/read-only-scalar-used-in-phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; float foo(float sum, float A[]) {
;
diff --git a/polly/test/ScopInfo/read-only-scalars.ll b/polly/test/ScopInfo/read-only-scalars.ll
index 71c2d21e357a..f04163e48028 100644
--- a/polly/test/ScopInfo/read-only-scalars.ll
+++ b/polly/test/ScopInfo/read-only-scalars.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=SCALARS
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-analyze-read-only-scalars=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=SCALARS
; CHECK-NOT: Memref_scalar
diff --git a/polly/test/ScopInfo/read-only-statements.ll b/polly/test/ScopInfo/read-only-statements.ll
index a93063ea3ad6..7bac53a2b6b5 100644
--- a/polly/test/ScopInfo/read-only-statements.ll
+++ b/polly/test/ScopInfo/read-only-statements.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check we remove read only statements.
;
diff --git a/polly/test/ScopInfo/reduction_alternating_base.ll b/polly/test/ScopInfo/reduction_alternating_base.ll
index 854e28023a3e..e38ff6046ac0 100644
--- a/polly/test/ScopInfo/reduction_alternating_base.ll
+++ b/polly/test/ScopInfo/reduction_alternating_base.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
;
; void f(int *A) {
diff --git a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll
index fb0274972082..17f9dc57f282 100644
--- a/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll
+++ b/polly/test/ScopInfo/reduction_chain_partially_outside_the_scop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: NONE
;
diff --git a/polly/test/ScopInfo/reduction_different_index.ll b/polly/test/ScopInfo/reduction_different_index.ll
index 575e5a16d7b2..d2786d5fd677 100644
--- a/polly/test/ScopInfo/reduction_different_index.ll
+++ b/polly/test/ScopInfo/reduction_different_index.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify if the following case is not detected as reduction.
;
; void f(int *A,int *sum) {
diff --git a/polly/test/ScopInfo/reduction_different_index1.ll b/polly/test/ScopInfo/reduction_different_index1.ll
index 39bd3c4b9abe..710ae3e74f21 100644
--- a/polly/test/ScopInfo/reduction_different_index1.ll
+++ b/polly/test/ScopInfo/reduction_different_index1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Verify if the following case is not detected as reduction.
;
; void f(int *A, int *sum, int i1, int i2) {
diff --git a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
index 7120740fbf34..61228e075dab 100644
--- a/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
+++ b/polly/test/ScopInfo/reduction_disabled_multiplicative.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basic-aa %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-disable-multiplicative-reductions -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -polly-disable-multiplicative-reductions -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: ReadAccess := [Reduction Type: +
; CHECK: { Stmt_for_body[i0] -> MemRef_sum[0] };
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate.ll b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
index dde09108ecc4..c66a8be0852f 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int N, int * restrict sums, int * restrict escape) {
; int i, j;
diff --git a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
index 702fc56025d9..c574d315b2fe 100644
--- a/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
+++ b/polly/test/ScopInfo/reduction_escaping_intermediate_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int N, int * restrict sums, int * restrict escape) {
; int i, j;
diff --git a/polly/test/ScopInfo/reduction_invalid_different_operators.ll b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
index f47919dcad99..9846f1029c08 100644
--- a/polly/test/ScopInfo/reduction_invalid_different_operators.ll
+++ b/polly/test/ScopInfo/reduction_invalid_different_operators.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; int f() {
; int i, sum = 0, sth = 0;
diff --git a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
index be1d7b5bbbd9..4d70e5330455 100644
--- a/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
+++ b/polly/test/ScopInfo/reduction_invalid_overlapping_accesses.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *sums) {
; int i, j;
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
index 8d20fa13ffe5..800eb2043dc6 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basic-aa %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Stmt_for_body
; CHECK: Reduction Type: *
diff --git a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
index 782332b56aad..49ebdcb04498 100644
--- a/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
+++ b/polly/test/ScopInfo/reduction_multiple_loops_array_sum_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basic-aa %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Stmt_for_body
; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
index 0f1a3ad90dac..77b71f4df301 100644
--- a/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
+++ b/polly/test/ScopInfo/reduction_multiple_simple_binary.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basic-aa %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: ReadAccess := [Reduction Type: NONE
; CHECK: { Stmt_for_body[i0] -> MemRef_A[1 + i0] };
diff --git a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
index 4e3f841cd8e1..61aaa051e49d 100644
--- a/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
+++ b/polly/test/ScopInfo/reduction_non_overlapping_chains.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: +
; CHECK: Reduction Type: +
diff --git a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
index 0c61d63a2d45..fb6d236764b7 100644
--- a/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
+++ b/polly/test/ScopInfo/reduction_only_reduction_like_access.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: +
;
diff --git a/polly/test/ScopInfo/reduction_simple_fp.ll b/polly/test/ScopInfo/reduction_simple_fp.ll
index ba0a034a17e3..aa4cd00f39f5 100644
--- a/polly/test/ScopInfo/reduction_simple_fp.ll
+++ b/polly/test/ScopInfo/reduction_simple_fp.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Function: f_no_fast_math
; CHECK: Reduction Type: NONE
diff --git a/polly/test/ScopInfo/reduction_simple_w_constant.ll b/polly/test/ScopInfo/reduction_simple_w_constant.ll
index dc1f8550602d..e385b66f9db2 100644
--- a/polly/test/ScopInfo/reduction_simple_w_constant.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_constant.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: +
;
diff --git a/polly/test/ScopInfo/reduction_simple_w_iv.ll b/polly/test/ScopInfo/reduction_simple_w_iv.ll
index b6c3229d08d5..e22eccbb2831 100644
--- a/polly/test/ScopInfo/reduction_simple_w_iv.ll
+++ b/polly/test/ScopInfo/reduction_simple_w_iv.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: +
;
diff --git a/polly/test/ScopInfo/reduction_two_identical_reads.ll b/polly/test/ScopInfo/reduction_two_identical_reads.ll
index 19d45a5f4ea9..8f00954f7efc 100644
--- a/polly/test/ScopInfo/reduction_two_identical_reads.ll
+++ b/polly/test/ScopInfo/reduction_two_identical_reads.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Reduction Type: NONE
;
diff --git a/polly/test/ScopInfo/redundant_parameter_constraint.ll b/polly/test/ScopInfo/redundant_parameter_constraint.ll
index c9d912191eed..ad71f1f59e18 100644
--- a/polly/test/ScopInfo/redundant_parameter_constraint.ll
+++ b/polly/test/ScopInfo/redundant_parameter_constraint.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; The constraint that r2 has to be bigger than r1 is implicitly contained in
; the domain, hence we do not want to see it explicitly.
diff --git a/polly/test/ScopInfo/region-with-instructions.ll b/polly/test/ScopInfo/region-with-instructions.ll
index 39d4a72a7814..d4720511b7aa 100644
--- a/polly/test/ScopInfo/region-with-instructions.ll
+++ b/polly/test/ScopInfo/region-with-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -polly-print-instructions -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -polly-print-instructions -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Statements {
; CHECK: Stmt_bb46
diff --git a/polly/test/ScopInfo/remarks.ll b/polly/test/ScopInfo/remarks.ll
index dcdeb58c7694..2c173a31c46e 100644
--- a/polly/test/ScopInfo/remarks.ll
+++ b/polly/test/ScopInfo/remarks.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: remark: test/ScopInfo/remarks.c:4:7: SCoP begins here.
diff --git a/polly/test/ScopInfo/required-invariant-loop-bounds.ll b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
index 248acbea6e68..abf0b0e23855 100644
--- a/polly/test/ScopInfo/required-invariant-loop-bounds.ll
+++ b/polly/test/ScopInfo/required-invariant-loop-bounds.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output \
-; RUN: -polly-invariant-load-hoisting=true < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output \
+; RUN: -polly-invariant-load-hoisting=true < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
diff --git a/polly/test/ScopInfo/restriction_in_dead_block.ll b/polly/test/ScopInfo/restriction_in_dead_block.ll
index 81d9b96be419..487c585cb9d9 100644
--- a/polly/test/ScopInfo/restriction_in_dead_block.ll
+++ b/polly/test/ScopInfo/restriction_in_dead_block.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we do not generate an empty invalid context only because the wrap
; in the second conditional will always happen if the block is executed.
diff --git a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
index d36da2b2becf..702b7dc5e004 100644
--- a/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
+++ b/polly/test/ScopInfo/run-time-check-many-array-disjuncts.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; DETECT: Valid Region for Scop: bb124 => bb176
;
diff --git a/polly/test/ScopInfo/run-time-check-many-parameters.ll b/polly/test/ScopInfo/run-time-check-many-parameters.ll
index 30f8d5fff34c..559c38d2682e 100644
--- a/polly/test/ScopInfo/run-time-check-many-parameters.ll
+++ b/polly/test/ScopInfo/run-time-check-many-parameters.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; A valid Scop would print the list of its statements; we check that we do not
; see that list.
diff --git a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
index 487c803bba98..3cf4c40bdb60 100644
--- a/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
+++ b/polly/test/ScopInfo/run-time-check-many-piecewise-aliasing.ll
@@ -1,6 +1,6 @@
-; RUN: opt %loadPolly -polly-print-detect -disable-output < %s \
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>' -disable-output < %s 2>&1 \
; RUN: | FileCheck %s -check-prefix=DETECT
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; DETECT: Valid Region for Scop: for => return
;
diff --git a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
index d590aaf00ddb..51ab81476d54 100644
--- a/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
+++ b/polly/test/ScopInfo/run-time-check-read-only-arrays.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void foo(float *A, float *B, float *C, long N) {
; for (long i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
index a5f353e7ad2a..dd809ba156c7 100644
--- a/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
+++ b/polly/test/ScopInfo/same-base-address-scalar-and-array.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we introduce two ScopArrayInfo objects (or virtual arrays) for the %out variable
; as it is used as a memory base pointer (%0) but also as a scalar (%out.addr.0.lcssa).
diff --git a/polly/test/ScopInfo/scalar.ll b/polly/test/ScopInfo/scalar.ll
index c38eaa853b9b..812d2fddc3c8 100644
--- a/polly/test/ScopInfo/scalar.ll
+++ b/polly/test/ScopInfo/scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/polly/test/ScopInfo/scalar_dependence_cond_br.ll b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
index 3303bfb7c6c5..59549f3dbbad 100644
--- a/polly/test/ScopInfo/scalar_dependence_cond_br.ll
+++ b/polly/test/ScopInfo/scalar_dependence_cond_br.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output< %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output< %s 2>&1 | FileCheck %s
;
; void f(int *A, int c, int d) {
; for (int i = 0; i < 1024; i++)
diff --git a/polly/test/ScopInfo/scalar_to_array.ll b/polly/test/ScopInfo/scalar_to_array.ll
index 5c275108602a..d64f1696c30b 100644
--- a/polly/test/ScopInfo/scalar_to_array.ll
+++ b/polly/test/ScopInfo/scalar_to_array.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -basic-aa -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -basic-aa -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; ModuleID = 'scalar_to_array.ll'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
index fc7a1bfc3d5e..d14569cf0c5d 100644
--- a/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
+++ b/polly/test/ScopInfo/scev-div-with-evaluatable-divisor.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; Derived from test-suite/SingleSource/UnitTests/Vector/SSE/sse.stepfft.c
diff --git a/polly/test/ScopInfo/scev-invalidated.ll b/polly/test/ScopInfo/scev-invalidated.ll
index 97fc5ec3d4ca..6b9efd4b37c7 100644
--- a/polly/test/ScopInfo/scev-invalidated.ll
+++ b/polly/test/ScopInfo/scev-invalidated.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Region: %if.then6---%return
;
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
index 2fdf7d66c3ad..6e2ed1240b07 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
index 92685858610c..d0e8a2accaa2 100644
--- a/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
+++ b/polly/test/ScopInfo/schedule-const-post-dominator-walk.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
index 413d1d8ec556..9ffc30f7360e 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not build a SCoP and do not crash.
;
diff --git a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
index be254477286f..65f2f99b48c1 100644
--- a/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
+++ b/polly/test/ScopInfo/schedule-constuction-endless-loop2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Check that we do not build a SCoP and do not crash.
;
diff --git a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
index ff339e03fb5a..7c36f8d7f72e 100644
--- a/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
+++ b/polly/test/ScopInfo/schedule-incorrectly-contructed-in-case-of-infinite-loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-process-unprofitable -polly-scops -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s
;
; This test contains an infinite loop (bb13) and crashed the domain generation
; at some point. Just verify it does not anymore.
diff --git a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
index 24c028a6764a..c8a234e9cbce 100644
--- a/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
+++ b/polly/test/ScopInfo/scop-affine-parameter-ordering.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n8:16:32:64-S128"
target triple = "aarch64--linux-android"
diff --git a/polly/test/ScopInfo/sign_wrapped_set.ll b/polly/test/ScopInfo/sign_wrapped_set.ll
index 23c9c8a3b84d..93b63df1c584 100644
--- a/polly/test/ScopInfo/sign_wrapped_set.ll
+++ b/polly/test/ScopInfo/sign_wrapped_set.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-process-unprofitable -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-process-unprofitable '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Domain :=
; CHECK-NEXT: [srcHeight] -> { Stmt_for_cond6_preheader_us[i0] : 0 <= i0 <= -3 + srcHeight };
diff --git a/polly/test/ScopInfo/simple_loop_1.ll b/polly/test/ScopInfo/simple_loop_1.ll
index 2c3481facc02..e736f3382d90 100644
--- a/polly/test/ScopInfo/simple_loop_1.ll
+++ b/polly/test/ScopInfo/simple_loop_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N) {
; int i;
diff --git a/polly/test/ScopInfo/simple_loop_2.ll b/polly/test/ScopInfo/simple_loop_2.ll
index 2f580094a147..ae83dd633b96 100644
--- a/polly/test/ScopInfo/simple_loop_2.ll
+++ b/polly/test/ScopInfo/simple_loop_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N) {
; int i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned.ll b/polly/test/ScopInfo/simple_loop_unsigned.ll
index 12903d9c1580..c4a96e4381c9 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], unsigned N) {
; unsigned i;
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_2.ll b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
index 1379180a6dd9..37e907dc006f 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_2.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_loop_unsigned_3.ll b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
index 7783c4681e1f..7f2cf5caa1ce 100644
--- a/polly/test/ScopInfo/simple_loop_unsigned_3.ll
+++ b/polly/test/ScopInfo/simple_loop_unsigned_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Assumed Context:
; CHECK-NEXT: [N] -> { : }
diff --git a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
index 42eff85d8c9b..4df0d343b0fc 100644
--- a/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
+++ b/polly/test/ScopInfo/simple_nonaffine_loop_not.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | not FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | not FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
@.str = private unnamed_addr constant [17 x i8] c"Random Value: %d\00", align 1
diff --git a/polly/test/ScopInfo/smax.ll b/polly/test/ScopInfo/smax.ll
index b938e4e412da..8968e1319247 100644
--- a/polly/test/ScopInfo/smax.ll
+++ b/polly/test/ScopInfo/smax.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
define void @foo(ptr noalias %data, ptr noalias %ptr, i32 %x_pos, i32 %w) {
diff --git a/polly/test/ScopInfo/statistics.ll b/polly/test/ScopInfo/statistics.ll
index 3797b7d71df9..0a294f2016eb 100644
--- a/polly/test/ScopInfo/statistics.ll
+++ b/polly/test/ScopInfo/statistics.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -stats -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -stats -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK-DAG: 4 polly-scops - Maximal number of loops in scops
diff --git a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
index d86d2418cf9b..a46acb090b7f 100644
--- a/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_exit_of_region_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Region__TO__Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_after_split.ll b/polly/test/ScopInfo/stmt_split_no_after_split.ll
index f8339bd8ae94..3a5ebf0725b1 100644
--- a/polly/test/ScopInfo/stmt_split_no_after_split.ll
+++ b/polly/test/ScopInfo/stmt_split_no_after_split.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_no_dependence.ll b/polly/test/ScopInfo/stmt_split_no_dependence.ll
index 7ad48f499792..9edd0f0a13e5 100644
--- a/polly/test/ScopInfo/stmt_split_no_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_no_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void func(int *A, int *B){
; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_store.ll b/polly/test/ScopInfo/stmt_split_on_store.ll
index 6af3dc8633dd..d645becb1958 100644
--- a/polly/test/ScopInfo/stmt_split_on_store.ll
+++ b/polly/test/ScopInfo/stmt_split_on_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=store -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=store -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void func(int *A, int *B){
; for (int i = 0; i < 1024; i+=1) {
diff --git a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
index 92855cfd0124..1a1ccff4f02d 100644
--- a/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
+++ b/polly/test/ScopInfo/stmt_split_on_synthesizable.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
index ee6afa4638d2..594b36279d6b 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_beginning_bb.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
index 0a5f41d637e7..6c9f1c2cb5fd 100644
--- a/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
+++ b/polly/test/ScopInfo/stmt_split_phi_in_stmt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
index 5b02d1b5d08a..07abe46ac039 100644
--- a/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
+++ b/polly/test/ScopInfo/stmt_split_scalar_dependence.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_split_within_loop.ll b/polly/test/ScopInfo/stmt_split_within_loop.ll
index 3ed9bbbeaccb..9a42ae3a3727 100644
--- a/polly/test/ScopInfo/stmt_split_within_loop.ll
+++ b/polly/test/ScopInfo/stmt_split_within_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-instructions -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-print-instructions '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Statements {
; CHECK-NEXT: Stmt_Stmt
diff --git a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
index 73fc543a66e8..ba4801d9a000 100644
--- a/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
+++ b/polly/test/ScopInfo/stmt_with_read_but_without_sideffect.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-delicm -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-delicm>' -disable-output < %s 2>&1 | FileCheck %s
;
; The statement Stmt_for_if_else_1 should be removed because it has no
; side effects. But it has a use of MemRef_tmp21 that must also be
diff --git a/polly/test/ScopInfo/switch-1.ll b/polly/test/ScopInfo/switch-1.ll
index 0ea40a7ed251..0c3610185e6e 100644
--- a/polly/test/ScopInfo/switch-1.ll
+++ b/polly/test/ScopInfo/switch-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-2.ll b/polly/test/ScopInfo/switch-2.ll
index 7956058c9de6..f0056da37955 100644
--- a/polly/test/ScopInfo/switch-2.ll
+++ b/polly/test/ScopInfo/switch-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-3.ll b/polly/test/ScopInfo/switch-3.ll
index aa7ada4edbb8..a1810bf6ef53 100644
--- a/polly/test/ScopInfo/switch-3.ll
+++ b/polly/test/ScopInfo/switch-3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-4.ll b/polly/test/ScopInfo/switch-4.ll
index 6aeb7197e382..00665fd75cbc 100644
--- a/polly/test/ScopInfo/switch-4.ll
+++ b/polly/test/ScopInfo/switch-4.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/switch-5.ll b/polly/test/ScopInfo/switch-5.ll
index 24cc92a0933d..2de369564940 100644
--- a/polly/test/ScopInfo/switch-5.ll
+++ b/polly/test/ScopInfo/switch-5.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; The SCoP contains a loop with multiple exit blocks (BBs after leaving
; the loop). The current implementation of deriving their domain derives
diff --git a/polly/test/ScopInfo/switch-6.ll b/polly/test/ScopInfo/switch-6.ll
index efb3df504d23..b859840ee111 100644
--- a/polly/test/ScopInfo/switch-6.ll
+++ b/polly/test/ScopInfo/switch-6.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int N) {
; for (int i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/switch-7.ll b/polly/test/ScopInfo/switch-7.ll
index 2f0d034e84fe..f73d97f70b28 100644
--- a/polly/test/ScopInfo/switch-7.ll
+++ b/polly/test/ScopInfo/switch-7.ll
@@ -1,6 +1,5 @@
-
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-ast -disable-output < %s | FileCheck %s --check-prefix=AST
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-ast>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=AST
;
; void f(int *A, int c, int N) {
; switch (c) {
diff --git a/polly/test/ScopInfo/tempscop-printing.ll b/polly/test/ScopInfo/tempscop-printing.ll
index 80c675d4c3d3..4f02176569b7 100644
--- a/polly/test/ScopInfo/tempscop-printing.ll
+++ b/polly/test/ScopInfo/tempscop-printing.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -basic-aa -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -aa-pipeline=basic-aa -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(long A[], int N, int *init_ptr) {
; long i, j;
diff --git a/polly/test/ScopInfo/test-wrapping-in-condition.ll b/polly/test/ScopInfo/test-wrapping-in-condition.ll
index 3ff978f7265e..746350422d6b 100644
--- a/polly/test/ScopInfo/test-wrapping-in-condition.ll
+++ b/polly/test/ScopInfo/test-wrapping-in-condition.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-print-function-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invalid Context:
; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/truncate-1.ll b/polly/test/ScopInfo/truncate-1.ll
index 5c5fac150b4b..44222c88dfa7 100644
--- a/polly/test/ScopInfo/truncate-1.ll
+++ b/polly/test/ScopInfo/truncate-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(char *A, short N) {
; for (char i = 0; i < (char)N; i++)
diff --git a/polly/test/ScopInfo/truncate-2.ll b/polly/test/ScopInfo/truncate-2.ll
index e6c5f2cb32d0..c78a5337fdeb 100644
--- a/polly/test/ScopInfo/truncate-2.ll
+++ b/polly/test/ScopInfo/truncate-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(char *A, short N) {
; for (short i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/truncate-3.ll b/polly/test/ScopInfo/truncate-3.ll
index dd0fe489e990..5a80a873cd47 100644
--- a/polly/test/ScopInfo/truncate-3.ll
+++ b/polly/test/ScopInfo/truncate-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -pass-remarks-analysis="polly-scops" \
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -pass-remarks-analysis="polly-scops" \
; RUN: -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Signed-unsigned restriction: [p] -> { : p <= -129 or p >= 128 }
diff --git a/polly/test/ScopInfo/two-loops-one-infinite.ll b/polly/test/ScopInfo/two-loops-one-infinite.ll
index 71f72383b048..e2723a8a9a2e 100644
--- a/polly/test/ScopInfo/two-loops-one-infinite.ll
+++ b/polly/test/ScopInfo/two-loops-one-infinite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Verify we do not create a SCoP in the presence of infinite loops.
;
diff --git a/polly/test/ScopInfo/two-loops-right-after-each-other.ll b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
index dd457c31afdd..51f3c2d6eb87 100644
--- a/polly/test/ScopInfo/two-loops-right-after-each-other.ll
+++ b/polly/test/ScopInfo/two-loops-right-after-each-other.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; CHECK: Statements {
; CHECK-NEXT: Stmt_loop_1
diff --git a/polly/test/ScopInfo/undef_in_cond.ll b/polly/test/ScopInfo/undef_in_cond.ll
index 5282a853c17a..ef117612f6cb 100644
--- a/polly/test/ScopInfo/undef_in_cond.ll
+++ b/polly/test/ScopInfo/undef_in_cond.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define fastcc void @fix_operands() nounwind {
diff --git a/polly/test/ScopInfo/unnamed_nonaffine.ll b/polly/test/ScopInfo/unnamed_nonaffine.ll
index bf32cc7806f4..5b9f98059177 100644
--- a/polly/test/ScopInfo/unnamed_nonaffine.ll
+++ b/polly/test/ScopInfo/unnamed_nonaffine.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-use-llvm-names=true -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-allow-nonaffine -polly-use-llvm-names=false -polly-print-scops -disable-output < %s | FileCheck %s -check-prefix=UNNAMED
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-use-llvm-names=false '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -check-prefix=UNNAMED
;
; void f(int *A, int b) {
; int x;
diff --git a/polly/test/ScopInfo/unnamed_stmts.ll b/polly/test/ScopInfo/unnamed_stmts.ll
index 686c0f87d9cf..5a189454471f 100644
--- a/polly/test/ScopInfo/unnamed_stmts.ll
+++ b/polly/test/ScopInfo/unnamed_stmts.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; This test case verifies that we generate numbered statement names in case
; no LLVM-IR names are used in the test case. We also verify that we
diff --git a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
index 0656b77e3409..daa1f8c78387 100644
--- a/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
+++ b/polly/test/ScopInfo/unpredictable_nonscop_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s -match-full-lines
; Derived from test-suite/MultiSource/Applications/sgefa/blas.c
;
; The exit value of %i.0320 in land.rhs is not computable.
diff --git a/polly/test/ScopInfo/unprofitable_scalar-accs.ll b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
index 9703587091a7..ca8daa4de01a 100644
--- a/polly/test/ScopInfo/unprofitable_scalar-accs.ll
+++ b/polly/test/ScopInfo/unprofitable_scalar-accs.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=HEURISTIC
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=false '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb -polly-process-unprofitable=false -polly-unprofitable-scalar-accs=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=HEURISTIC
; Check the effect of -polly-unprofitable-scalar-accs
diff --git a/polly/test/ScopInfo/unsigned-condition.ll b/polly/test/ScopInfo/unsigned-condition.ll
index 35673d1b6a36..0529ded1f6cf 100644
--- a/polly/test/ScopInfo/unsigned-condition.ll
+++ b/polly/test/ScopInfo/unsigned-condition.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N, unsigned P) {
; int i;
diff --git a/polly/test/ScopInfo/unsigned-division-1.ll b/polly/test/ScopInfo/unsigned-division-1.ll
index 8c65062bd941..1c06b55300b6 100644
--- a/polly/test/ScopInfo/unsigned-division-1.ll
+++ b/polly/test/ScopInfo/unsigned-division-1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned N) {
; for (unsigned i = 0; i < N / 2; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-2.ll b/polly/test/ScopInfo/unsigned-division-2.ll
index bf4ebce9099a..153639c42b38 100644
--- a/polly/test/ScopInfo/unsigned-division-2.ll
+++ b/polly/test/ScopInfo/unsigned-division-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned N) {
; for (unsigned i = 0; i < N / 2 + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-3.ll b/polly/test/ScopInfo/unsigned-division-3.ll
index 47ba1f2ef09d..34561fc4645c 100644
--- a/polly/test/ScopInfo/unsigned-division-3.ll
+++ b/polly/test/ScopInfo/unsigned-division-3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned char N) {
; for (unsigned i = 0; i <= N / -128; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-4.ll b/polly/test/ScopInfo/unsigned-division-4.ll
index edcd8a18a854..be539b47123b 100644
--- a/polly/test/ScopInfo/unsigned-division-4.ll
+++ b/polly/test/ScopInfo/unsigned-division-4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned char N) {
; for (unsigned i = 0; i < (N / -128) + 3; i++)
diff --git a/polly/test/ScopInfo/unsigned-division-5.ll b/polly/test/ScopInfo/unsigned-division-5.ll
index f9a3d39288a9..61716ecec0d9 100644
--- a/polly/test/ScopInfo/unsigned-division-5.ll
+++ b/polly/test/ScopInfo/unsigned-division-5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, unsigned N) {
; for (unsigned i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/unsigned_wrap_uge.ll b/polly/test/ScopInfo/unsigned_wrap_uge.ll
index 89c50ee3764b..d25a9576e863 100644
--- a/polly/test/ScopInfo/unsigned_wrap_uge.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_uge.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Unsigned wrap-around check.
;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ugt.ll b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
index 3249123c9918..0310fdde6d26 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ugt.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ugt.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Unsigned wrap-around check.
;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ule.ll b/polly/test/ScopInfo/unsigned_wrap_ule.ll
index 3c6ea18b439c..47bfc6065b1a 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ule.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ule.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Unsigned wrap-around check.
;
diff --git a/polly/test/ScopInfo/unsigned_wrap_ult.ll b/polly/test/ScopInfo/unsigned_wrap_ult.ll
index 5d859f85d52b..1b73c0d6dd7e 100644
--- a/polly/test/ScopInfo/unsigned_wrap_ult.ll
+++ b/polly/test/ScopInfo/unsigned_wrap_ult.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; Unsigned wrap-around check.
;
diff --git a/polly/test/ScopInfo/user_context.ll b/polly/test/ScopInfo/user_context.ll
index 46232cd59c03..74088120e401 100644
--- a/polly/test/ScopInfo/user_context.ll
+++ b/polly/test/ScopInfo/user_context.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-context='[N] -> {: N = 1024}' -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=CTX
-; RUN: opt %loadPolly -polly-context='[N,M] -> {: 1 = 0}' -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-context='[] -> {: 1 = 0}' -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-context='[N] -> {: N = 1024}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=CTX
+; RUN: opt %loadNPMPolly -polly-context='[N,M] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-context='[] -> {: 1 = 0}' '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
; void f(int a[], int N) {
; int i;
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
index 4bd02c96a3d2..bd13ba8bb696 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed-conditional.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; REMARK: remark: <unknown>:0:0: Use user assumption: [n, b] -> { : n <= 100 or (b = 0 and n >= 101) }
;
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
index 262bd1349a69..45f59170942e 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-signed.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Context:
; CHECK-NEXT: [n] -> { : -9223372036854775808 <= n <= 100 }
diff --git a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
index 4a10fcff929a..fb71c75aa75e 100644
--- a/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions-in-bb-unsigned.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; REMARK: remark: <unknown>:0:0: SCoP begins here.
; REMARK-NEXT: remark: <unknown>:0:0: Use user assumption: [n] -> { : n <= 100 }
diff --git a/polly/test/ScopInfo/user_provided_assumptions.ll b/polly/test/ScopInfo/user_provided_assumptions.ll
index 6640e4a65e36..49b23b1e784d 100644
--- a/polly/test/ScopInfo/user_provided_assumptions.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
;
; CHECK: remark: <unknown>:0:0: SCoP begins here.
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [M, N] -> { : N <= 2147483647 - M }
diff --git a/polly/test/ScopInfo/user_provided_assumptions_2.ll b/polly/test/ScopInfo/user_provided_assumptions_2.ll
index 994cd6f15103..f8643b68cc63 100644
--- a/polly/test/ScopInfo/user_provided_assumptions_2.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions_2.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
;
; CHECK: remark: <unknown>:0:0: SCoP begins here.
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: { : }
diff --git a/polly/test/ScopInfo/user_provided_assumptions_3.ll b/polly/test/ScopInfo/user_provided_assumptions_3.ll
index 2fcde8bd1826..70f8f359e16c 100644
--- a/polly/test/ScopInfo/user_provided_assumptions_3.ll
+++ b/polly/test/ScopInfo/user_provided_assumptions_3.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops -disable-output < %s 2>&1 | FileCheck %s
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s --check-prefix=SCOP
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=SCOP
;
; CHECK: remark: <unknown>:0:0: SCoP begins here.
; CHECK-NEXT: remark: <unknown>:0:0: Use user assumption: [N] -> { : N >= 2 }
diff --git a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll
index 1eb3c15810e4..3e7883db48fc 100644
--- a/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll
+++ b/polly/test/ScopInfo/user_provided_non_dominating_assumptions.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: remark: <unknown>:0:0: SCoP begins here.
@@ -18,7 +18,7 @@
;
-; RUN: opt %loadPolly -pass-remarks-analysis="polly-scops" -polly-scops \
+; RUN: opt %loadNPMPolly -pass-remarks-analysis="polly-scops" '-passes=print<polly-function-scops>' \
; RUN: -polly-precise-inbounds -disable-output < %s 2>&1 -pass-remarks-output=%t.yaml
; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
; YAML: --- !Analysis
diff --git a/polly/test/ScopInfo/variant_base_pointer.ll b/polly/test/ScopInfo/variant_base_pointer.ll
index 321657c87e79..32cb114fab05 100644
--- a/polly/test/ScopInfo/variant_base_pointer.ll
+++ b/polly/test/ScopInfo/variant_base_pointer.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
-; RUN: opt %loadPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -polly-codegen -disable-output < %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true '-passes=print<polly-detect>,print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-ignore-aliasing -polly-invariant-load-hoisting=true -passes=polly-codegen -disable-output < %s
;
; %tmp is added to the list of required hoists by -polly-scops and just
; assumed to be hoisted. Only -polly-scops recognizes it to be unhoistable
diff --git a/polly/test/ScopInfo/variant_load_empty_domain.ll b/polly/test/ScopInfo/variant_load_empty_domain.ll
index 0e685c3c7e73..6a28bd0405fd 100644
--- a/polly/test/ScopInfo/variant_load_empty_domain.ll
+++ b/polly/test/ScopInfo/variant_load_empty_domain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invariant Accesses: {
; CHECK-NEXT: }
diff --git a/polly/test/ScopInfo/wraping_signed_expr_0.ll b/polly/test/ScopInfo/wraping_signed_expr_0.ll
index 7ad0f64028b6..f5f06bfd7d33 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_0.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_0.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, char N, char p) {
; for (char i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/wraping_signed_expr_1.ll b/polly/test/ScopInfo/wraping_signed_expr_1.ll
index 0a62b9cf542c..e04257acc201 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_1.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(long *A, long N, long p) {
; for (long i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/wraping_signed_expr_2.ll b/polly/test/ScopInfo/wraping_signed_expr_2.ll
index f3b4665f7f37..2511c0d64608 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_2.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int N, int p) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/wraping_signed_expr_3.ll b/polly/test/ScopInfo/wraping_signed_expr_3.ll
index 7a5cbba9436b..2106bdf4c068 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_3.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_3.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(int *A, int N, int p) {
; for (int i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/wraping_signed_expr_4.ll b/polly/test/ScopInfo/wraping_signed_expr_4.ll
index ec65f70a092f..3ea17f6e266b 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_4.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_4.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(char *A, char N, char p) {
; for (char i = 0; i < N; i++)
diff --git a/polly/test/ScopInfo/wraping_signed_expr_5.ll b/polly/test/ScopInfo/wraping_signed_expr_5.ll
index 5f3b09ba33c1..90706a3d3bc4 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_5.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_5.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; We should not generate a runtime check for ((int)r1 + (int)r2) as it is known not
; to overflow. However (p + q) can, thus checks are needed.
diff --git a/polly/test/ScopInfo/wraping_signed_expr_6.ll b/polly/test/ScopInfo/wraping_signed_expr_6.ll
index 23258bb513bf..9cf67fc10180 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_6.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_6.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invalid Context:
; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/wraping_signed_expr_7.ll b/polly/test/ScopInfo/wraping_signed_expr_7.ll
index 0663d4e0bc10..d18d2b2df3e1 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_7.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_7.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Invalid Context:
; CHECK: [N] -> { : N >= 129 }
diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll
index ec36d2c5fcde..84626861bd39 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_slow_1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; This checks that the no-wraps checks will be computed fast as some example
; already showed huge slowdowns even though the inbounds and nsw flags were
diff --git a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll
index 6db33ab166d5..b4dd567bafa6 100644
--- a/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll
+++ b/polly/test/ScopInfo/wraping_signed_expr_slow_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; This checks that the no-wraps checks will be computed fast as some example
; already showed huge slowdowns even though the inbounds and nsw flags were
diff --git a/polly/test/ScopInfo/zero_ext_of_truncate.ll b/polly/test/ScopInfo/zero_ext_of_truncate.ll
index fc55df5e053c..bd3749b6aa74 100644
--- a/polly/test/ScopInfo/zero_ext_of_truncate.ll
+++ b/polly/test/ScopInfo/zero_ext_of_truncate.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(unsigned *restrict I, unsigned *restrict A, unsigned N, unsigned M) {
; for (unsigned i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll
index 13e9c03ecd2d..b30604527676 100644
--- a/polly/test/ScopInfo/zero_ext_of_truncate_2.ll
+++ b/polly/test/ScopInfo/zero_ext_of_truncate_2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-invariant-load-hoisting=true -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-invariant-load-hoisting=true '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; void f(unsigned long *restrict I, unsigned *restrict A, unsigned N) {
; for (unsigned i = 0; i < N; i++) {
diff --git a/polly/test/ScopInfo/zero_ext_space_mismatch.ll b/polly/test/ScopInfo/zero_ext_space_mismatch.ll
index 835a8664b75e..3c02ae295b5b 100644
--- a/polly/test/ScopInfo/zero_ext_space_mismatch.ll
+++ b/polly/test/ScopInfo/zero_ext_space_mismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output < %s 2>&1 | FileCheck %s
;
; CHECK: Assumed Context:
; CHECK-NEXT: [dim] -> { : dim > 0 }
diff --git a/polly/test/ScopInliner/invariant-load-func.ll b/polly/test/ScopInliner/invariant-load-func.ll
index 38e4a15aab94..ffd2ec9cdb60 100644
--- a/polly/test/ScopInliner/invariant-load-func.ll
+++ b/polly/test/ScopInliner/invariant-load-func.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-detect-full-functions -polly-scop-inliner \
-; RUN: -polly-invariant-load-hoisting -polly-print-scops -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-detect-full-functions -polly-scop-inliner \
+; RUN: -polly-invariant-load-hoisting '-passes=print<polly-function-scops>' -disable-output < %s | FileCheck %s
; Check that we inline a function that requires invariant load hoisting
; correctly.
diff --git a/polly/test/Simplify/coalesce_3partials.ll b/polly/test/Simplify/coalesce_3partials.ll
index 0c1556ff263a..4112787e51bf 100644
--- a/polly/test/Simplify/coalesce_3partials.ll
+++ b/polly/test/Simplify/coalesce_3partials.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Combine 3 partial accesses into one.
;
diff --git a/polly/test/Simplify/coalesce_disjointelements.ll b/polly/test/Simplify/coalesce_disjointelements.ll
index 2f4cf4e3f920..b140f287e27f 100644
--- a/polly/test/Simplify/coalesce_disjointelements.ll
+++ b/polly/test/Simplify/coalesce_disjointelements.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Combine four partial stores into two.
; The stores write to the same array, but never the same element.
diff --git a/polly/test/Simplify/coalesce_overlapping.ll b/polly/test/Simplify/coalesce_overlapping.ll
index 78ed21e9855b..ee716fc12f09 100644
--- a/polly/test/Simplify/coalesce_overlapping.ll
+++ b/polly/test/Simplify/coalesce_overlapping.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Combine two partial stores (with overlapping domains) into one.
;
diff --git a/polly/test/Simplify/coalesce_partial.ll b/polly/test/Simplify/coalesce_partial.ll
index c42aaa113035..aea691f43e93 100644
--- a/polly/test/Simplify/coalesce_partial.ll
+++ b/polly/test/Simplify/coalesce_partial.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Combine two partial stores (with disjoint domains) into one.
;
diff --git a/polly/test/Simplify/dead_access_load.ll b/polly/test/Simplify/dead_access_load.ll
index 1804613c0a79..66f94795ea6e 100644
--- a/polly/test/Simplify/dead_access_load.ll
+++ b/polly/test/Simplify/dead_access_load.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove a dead load-instruction
diff --git a/polly/test/Simplify/dead_access_phi.ll b/polly/test/Simplify/dead_access_phi.ll
index d263b89aff58..fb40e4cc45b3 100644
--- a/polly/test/Simplify/dead_access_phi.ll
+++ b/polly/test/Simplify/dead_access_phi.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove a dead PHI write/read pair
diff --git a/polly/test/Simplify/dead_access_value.ll b/polly/test/Simplify/dead_access_value.ll
index 6e3c211577f6..a8ff7f28542b 100644
--- a/polly/test/Simplify/dead_access_value.ll
+++ b/polly/test/Simplify/dead_access_value.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove a dead value write/read pair
diff --git a/polly/test/Simplify/dead_instruction.ll b/polly/test/Simplify/dead_instruction.ll
index 4e693b0ccb44..81e55e1c7bb3 100644
--- a/polly/test/Simplify/dead_instruction.ll
+++ b/polly/test/Simplify/dead_instruction.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove a dead instruction
diff --git a/polly/test/Simplify/emptyaccessdomain.ll b/polly/test/Simplify/emptyaccessdomain.ll
index 54ac14ab398c..9b06cec965a9 100644
--- a/polly/test/Simplify/emptyaccessdomain.ll
+++ b/polly/test/Simplify/emptyaccessdomain.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; for (int j = 0; j < n; j += 1) {
; A[0] = 42.0;
diff --git a/polly/test/Simplify/exit_phi_accesses-2.ll b/polly/test/Simplify/exit_phi_accesses-2.ll
index 01748aa59bd3..379c7e0ace0a 100644
--- a/polly/test/Simplify/exit_phi_accesses-2.ll
+++ b/polly/test/Simplify/exit_phi_accesses-2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -polly-print-simplify -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s | FileCheck %s
;
; The use of %sum.next by %phi counts as an escaping use.
; Don't remove the scalar write of %sum.next.
diff --git a/polly/test/Simplify/func-b320a7.ll b/polly/test/Simplify/func-b320a7.ll
index c8a823a468d7..5aa2caba95cf 100644
--- a/polly/test/Simplify/func-b320a7.ll
+++ b/polly/test/Simplify/func-b320a7.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-print-simplify -polly-optree -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>,polly-optree' -disable-output < %s | FileCheck %s -match-full-lines
; llvm.org/PR47098
; Use-after-free by a reference to a Stmt remaining in InstStmtMap after it has been removed by Scop::simplifyScop.
diff --git a/polly/test/Simplify/gemm.ll b/polly/test/Simplify/gemm.ll
index 23f8de5573cd..5120de2db767 100644
--- a/polly/test/Simplify/gemm.ll
+++ b/polly/test/Simplify/gemm.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
;
; void gemm(float A[][1024], float B[][1024], float C[][1024]) {
; for (long i = 0; i < 1024; i++)
diff --git a/polly/test/Simplify/nocoalesce_differentvalues.ll b/polly/test/Simplify/nocoalesce_differentvalues.ll
index 68991d2eecf5..33d04b2f96de 100644
--- a/polly/test/Simplify/nocoalesce_differentvalues.ll
+++ b/polly/test/Simplify/nocoalesce_differentvalues.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Do not combine stores that write different values.
;
diff --git a/polly/test/Simplify/nocoalesce_elementmismatch.ll b/polly/test/Simplify/nocoalesce_elementmismatch.ll
index 2bab360e6858..608b055e691d 100644
--- a/polly/test/Simplify/nocoalesce_elementmismatch.ll
+++ b/polly/test/Simplify/nocoalesce_elementmismatch.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Do not combine stores that do not write to different elements in the
; same instance.
diff --git a/polly/test/Simplify/nocoalesce_readbetween.ll b/polly/test/Simplify/nocoalesce_readbetween.ll
index ada79dc18b87..e112b036cd77 100644
--- a/polly/test/Simplify/nocoalesce_readbetween.ll
+++ b/polly/test/Simplify/nocoalesce_readbetween.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Do not combine stores if there is a read between them.
; Note: The read between is unused, so will be removed by markAndSweep.
diff --git a/polly/test/Simplify/nocoalesce_writebetween.ll b/polly/test/Simplify/nocoalesce_writebetween.ll
index 48e785ec2c26..fd5eee52eaf5 100644
--- a/polly/test/Simplify/nocoalesce_writebetween.ll
+++ b/polly/test/Simplify/nocoalesce_writebetween.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Do not combine stores if there is a write between them.
;
diff --git a/polly/test/Simplify/notdead_region_exitphi.ll b/polly/test/Simplify/notdead_region_exitphi.ll
index bd29fd578b97..42fafb446cea 100644
--- a/polly/test/Simplify/notdead_region_exitphi.ll
+++ b/polly/test/Simplify/notdead_region_exitphi.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Do not remove dependencies of a phi node in a region's exit block.
diff --git a/polly/test/Simplify/notdead_region_innerphi.ll b/polly/test/Simplify/notdead_region_innerphi.ll
index a176a28af233..966448c9884b 100644
--- a/polly/test/Simplify/notdead_region_innerphi.ll
+++ b/polly/test/Simplify/notdead_region_innerphi.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Do not remove dependencies of a phi node within a region statement (%phi).
diff --git a/polly/test/Simplify/notredundant_region_loop.ll b/polly/test/Simplify/notredundant_region_loop.ll
index 0ea9be7e9d2d..88f6c4152173 100644
--- a/polly/test/Simplify/notredundant_region_loop.ll
+++ b/polly/test/Simplify/notredundant_region_loop.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -polly-allow-nonaffine-loops -disable-output < %s | FileCheck %s -match-full-lines
;
; Do not remove the store in region_entry. It can be executed multiple times
; due to being part of a non-affine loop.
diff --git a/polly/test/Simplify/notredundant_region_middle.ll b/polly/test/Simplify/notredundant_region_middle.ll
index 84598746e0bb..43c05436809b 100644
--- a/polly/test/Simplify/notredundant_region_middle.ll
+++ b/polly/test/Simplify/notredundant_region_middle.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Do not remove redundant stores in the middle of region statements.
diff --git a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll
index 2affdbb2f1de..8a9aec8be9e0 100644
--- a/polly/test/Simplify/notredundant_synthesizable_unknownit.ll
+++ b/polly/test/Simplify/notredundant_synthesizable_unknownit.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Do not remove the scalar value write of %i.trunc in inner.for.
diff --git a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll
index 511f35a9388e..7218f328f9ca 100644
--- a/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll
+++ b/polly/test/Simplify/out-of-scop-use-in-region-entry-phi-node.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-scops -polly-print-simplify -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb '-passes=print<polly-function-scops>,scop(print<polly-simplify>)' -disable-output < %s 2>&1 | FileCheck %s
;
; %tmp5 must keep the Value WRITE MemoryAccess, because as an incoming value of
; %tmp4, it is an "external use".
diff --git a/polly/test/Simplify/overwritten.ll b/polly/test/Simplify/overwritten.ll
index a32d6a8daeb0..eccdd8044d07 100644
--- a/polly/test/Simplify/overwritten.ll
+++ b/polly/test/Simplify/overwritten.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s
;
; Remove a store that is overwritten by another store in the same statement.
diff --git a/polly/test/Simplify/overwritten_3phi.ll b/polly/test/Simplify/overwritten_3phi.ll
index 24758b9b7cf9..4cee4f13d26d 100644
--- a/polly/test/Simplify/overwritten_3phi.ll
+++ b/polly/test/Simplify/overwritten_3phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Remove identical writes
; (two stores in the same statement that write the same value to the same
diff --git a/polly/test/Simplify/overwritten_3store.ll b/polly/test/Simplify/overwritten_3store.ll
index 63eb5b54f931..c9f06c85dba5 100644
--- a/polly/test/Simplify/overwritten_3store.ll
+++ b/polly/test/Simplify/overwritten_3store.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-stmt-granularity=bb -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
; RUN: opt %loadNPMPolly -polly-stmt-granularity=bb "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s
;
; Remove a store that is overwritten by another store in the same statement.
diff --git a/polly/test/Simplify/overwritten_implicit_and_explicit.ll b/polly/test/Simplify/overwritten_implicit_and_explicit.ll
index 56c63b48f761..b1b7635e2626 100644
--- a/polly/test/Simplify/overwritten_implicit_and_explicit.ll
+++ b/polly/test/Simplify/overwritten_implicit_and_explicit.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Remove a store that is overwritten by another store in the same statement.
; Check that this works even if one of the writes is a scalar MemoryKind.
diff --git a/polly/test/Simplify/overwritten_loadbetween.ll b/polly/test/Simplify/overwritten_loadbetween.ll
index b31f45d5db62..cdca2f11531e 100644
--- a/polly/test/Simplify/overwritten_loadbetween.ll
+++ b/polly/test/Simplify/overwritten_loadbetween.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck -match-full-lines %s
;
; Do not remove overwrites when the value is read before.
diff --git a/polly/test/Simplify/overwritten_scalar.ll b/polly/test/Simplify/overwritten_scalar.ll
index d55ea7712c36..700adb6aed2e 100644
--- a/polly/test/Simplify/overwritten_scalar.ll
+++ b/polly/test/Simplify/overwritten_scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck -match-full-lines %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck -match-full-lines %s
;
; Remove identical writes
; (two stores in the same statement that write the same value to the same
diff --git a/polly/test/Simplify/pass_existence.ll b/polly/test/Simplify/pass_existence.ll
index fc5287ed2ee2..4d1d800b2a80 100644
--- a/polly/test/Simplify/pass_existence.ll
+++ b/polly/test/Simplify/pass_existence.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s
; RUN: opt %loadNPMPolly -disable-output "-passes=scop(print<polly-simplify>)" < %s -aa-pipeline=basic-aa < %s | FileCheck %s
;
; Simple test for the existence of the Simplify pass.
diff --git a/polly/test/Simplify/phi_in_regionstmt.ll b/polly/test/Simplify/phi_in_regionstmt.ll
index 32bb75427589..76efd484f547 100644
--- a/polly/test/Simplify/phi_in_regionstmt.ll
+++ b/polly/test/Simplify/phi_in_regionstmt.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; The PHINode %cond91.sink.sink.us.sink.6 is in the middle of a region
diff --git a/polly/test/Simplify/pr33323.ll b/polly/test/Simplify/pr33323.ll
index 751f0bff5961..22921d5fba50 100644
--- a/polly/test/Simplify/pr33323.ll
+++ b/polly/test/Simplify/pr33323.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s
;
; llvm.org/PR33323
;
diff --git a/polly/test/Simplify/redundant.ll b/polly/test/Simplify/redundant.ll
index e85352bc889f..540e537460e5 100644
--- a/polly/test/Simplify/redundant.ll
+++ b/polly/test/Simplify/redundant.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Remove redundant store (a store that writes the same value already
diff --git a/polly/test/Simplify/redundant_differentindex.ll b/polly/test/Simplify/redundant_differentindex.ll
index 23531c24344f..5ce25836dedb 100644
--- a/polly/test/Simplify/redundant_differentindex.ll
+++ b/polly/test/Simplify/redundant_differentindex.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; A store that has a different index than the load it is storing is
diff --git a/polly/test/Simplify/redundant_region.ll b/polly/test/Simplify/redundant_region.ll
index dbcb420ac2f3..927aac6c4af0 100644
--- a/polly/test/Simplify/redundant_region.ll
+++ b/polly/test/Simplify/redundant_region.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Remove redundant store (a store that writes the same value already
; at the destination) in a region.
diff --git a/polly/test/Simplify/redundant_region_scalar.ll b/polly/test/Simplify/redundant_region_scalar.ll
index 95a581ad6f57..72d570d46bdc 100644
--- a/polly/test/Simplify/redundant_region_scalar.ll
+++ b/polly/test/Simplify/redundant_region_scalar.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Remove redundant store (a store that writes the same value already
; at the destination) in a region.
diff --git a/polly/test/Simplify/redundant_scalarwrite.ll b/polly/test/Simplify/redundant_scalarwrite.ll
index e2f7bbedc023..84cb971be11f 100644
--- a/polly/test/Simplify/redundant_scalarwrite.ll
+++ b/polly/test/Simplify/redundant_scalarwrite.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Remove redundant scalar stores.
;
diff --git a/polly/test/Simplify/redundant_storebetween.ll b/polly/test/Simplify/redundant_storebetween.ll
index f624b6e5b995..6540d7751e46 100644
--- a/polly/test/Simplify/redundant_storebetween.ll
+++ b/polly/test/Simplify/redundant_storebetween.ll
@@ -1,4 +1,3 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
; RUN: opt %loadNPMPolly "-passes=scop(print<polly-simplify>)" -disable-output -aa-pipeline=basic-aa < %s | FileCheck %s -match-full-lines
;
; Don't remove store where there is another store to the same target
diff --git a/polly/test/Simplify/scalability1.ll b/polly/test/Simplify/scalability1.ll
index 0ef99ce1ad8e..c6e36f9dcdef 100644
--- a/polly/test/Simplify/scalability1.ll
+++ b/polly/test/Simplify/scalability1.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ignore-inbounds -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Test scalability.
;
diff --git a/polly/test/Simplify/scalability2.ll b/polly/test/Simplify/scalability2.ll
index bac0810b0afa..adcf9eef348a 100644
--- a/polly/test/Simplify/scalability2.ll
+++ b/polly/test/Simplify/scalability2.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-ignore-inbounds -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly -polly-ignore-inbounds '-passes=print<polly-simplify>' -disable-output < %s | FileCheck %s -match-full-lines
;
; Test scalability.
;
diff --git a/polly/test/Simplify/sweep_mapped_phi.ll b/polly/test/Simplify/sweep_mapped_phi.ll
index add1681cdf36..495d77a22f61 100644
--- a/polly/test/Simplify/sweep_mapped_phi.ll
+++ b/polly/test/Simplify/sweep_mapped_phi.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Map %phi to A[j], so the scalar write in Stmt_for_bodyA can be removed.
;
diff --git a/polly/test/Simplify/sweep_mapped_value.ll b/polly/test/Simplify/sweep_mapped_value.ll
index 2e2f9c37febe..c83941a8f0ba 100644
--- a/polly/test/Simplify/sweep_mapped_value.ll
+++ b/polly/test/Simplify/sweep_mapped_value.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-import-jscop -polly-import-jscop-postfix=transformed -polly-print-simplify -disable-output < %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print<polly-simplify>' -polly-import-jscop-postfix=transformed -disable-output < %s | FileCheck %s -match-full-lines
;
; Map %val to A[j], so the scalar write on Stmt_for_bodyB can be removed.
;
diff --git a/polly/test/Simplify/ununsed_read_in_region_entry.ll b/polly/test/Simplify/ununsed_read_in_region_entry.ll
index 9b2d4521e2d6..f2436c263a96 100644
--- a/polly/test/Simplify/ununsed_read_in_region_entry.ll
+++ b/polly/test/Simplify/ununsed_read_in_region_entry.ll
@@ -1,5 +1,5 @@
-; RUN: opt %loadPolly -polly-print-simplify -disable-output< %s | FileCheck %s -match-full-lines
-; RUN: opt %loadPolly -polly-simplify -polly-codegen -S < %s | FileCheck %s -check-prefix=CODEGEN
+; RUN: opt %loadNPMPolly '-passes=print<polly-simplify>' -disable-output< %s | FileCheck %s -match-full-lines
+; RUN: opt %loadNPMPolly '-passes=polly-simplify,polly-codegen' -S < %s | FileCheck %s -check-prefix=CODEGEN
;
; for (int i = 0; i < n; i+=1) {
; (void)A[0];
diff --git a/polly/test/Support/Plugins.ll b/polly/test/Support/Plugins.ll
index cee878f1c6ac..872a32fad4fe 100644
--- a/polly/test/Support/Plugins.ll
+++ b/polly/test/Support/Plugins.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadNPMPolly -passes='polly-prepare,scop(print<polly-ast>)' -S < %s \
+; RUN: opt %loadNPMPolly '-passes=polly-prepare,scop(print<polly-ast>)' -S < %s \
; RUN: | FileCheck %s
; This testcase tests plugin registration. Check-lines below serve to verify
diff --git a/polly/test/Support/isl-args.ll b/polly/test/Support/isl-args.ll
index efa94194bc3f..206cb73bfc5a 100644
--- a/polly/test/Support/isl-args.ll
+++ b/polly/test/Support/isl-args.ll
@@ -1,7 +1,7 @@
-; RUN: opt %loadPolly -polly-scops -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION
-; RUN: opt %loadPolly -polly-scops -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP
-; RUN: not opt %loadPolly -polly-scops -disable-output -polly-isl-arg=-asdf < %s 2>&1| FileCheck %s -match-full-lines --check-prefix=UNKNOWN
-; RUN: opt %loadPolly -polly-scops -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-V < %s | FileCheck %s -match-full-lines --check-prefix=VERSION
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-h < %s | FileCheck %s -match-full-lines --check-prefix=HELP
+; RUN: not opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=-asdf < %s 2>&1| FileCheck %s -match-full-lines --check-prefix=UNKNOWN
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -disable-output -polly-isl-arg=--schedule-algorithm=feautrier < %s
; VERSION: isl-{{.*}}-IMath-32
; HELP: Usage: -polly-isl-arg [OPTION...]
diff --git a/polly/test/lit.site.cfg.in b/polly/test/lit.site.cfg.in
index b44061260834..d8a0b6ae3a3b 100644
--- a/polly/test/lit.site.cfg.in
+++ b/polly/test/lit.site.cfg.in
@@ -48,7 +48,6 @@ else:
config.substitutions.append(('%loadPolly', commonOpts ))
config.substitutions.append(('%loadNPMPolly', commonOpts ))
-
import lit.llvm
lit.llvm.initialize(lit_config, config)
diff --git a/polly/test/polly.ll b/polly/test/polly.ll
index f78cceacfb12..2e455b39a9cd 100644
--- a/polly/test/polly.ll
+++ b/polly/test/polly.ll
@@ -1,4 +1,4 @@
-; RUN: opt %loadPolly -polly-scops -S < %s | FileCheck %s
+; RUN: opt %loadNPMPolly '-passes=print<polly-function-scops>' -S < %s 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @foo() nounwind {
start:
diff --git a/utils/bazel/.bazelrc b/utils/bazel/.bazelrc
index 5a6d1889076a..09111bcdc834 100644
--- a/utils/bazel/.bazelrc
+++ b/utils/bazel/.bazelrc
@@ -51,9 +51,6 @@ build --experimental_cc_shared_library
build:zlib_external --repo_env=BAZEL_LLVM_ZLIB_STRATEGY=external
build:zlib_system --repo_env=BAZEL_LLVM_ZLIB_STRATEGY=system
-build:terminfo_external --repo_env=BAZEL_LLVM_TERMINFO_STRATEGY=external
-build:terminfo_system --repo_env=BAZEL_LLVM_TERMINFO_STRATEGY=system
-
###############################################################################
# Options for "generic_clang" builds: these options should generally apply to
# builds using a Clang-based compiler, and default to the `clang` executable on
diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
index 1c12c8167ba4..7413b018ef32 100644
--- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel
@@ -167,7 +167,10 @@ cc_library(
]),
hdrs = glob([
"include/bolt/Passes/*.h",
- ]),
+ ]) + [
+ # To avoid circular dependency on "Profile".
+ "include/bolt/Profile/BoltAddressTranslation.h",
+ ],
includes = ["include"],
deps = [
":Core",
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index c469da74fc56..81e12b7f108f 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -1881,6 +1881,7 @@ cc_library(
":Instrumentation",
":MC",
":MCParser",
+ ":ObjCARC",
":Object",
":ProfileData",
":Remarks",
diff --git a/utils/bazel/llvm-project-overlay/llvm/driver.bzl b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
index 10796d919834..a57a14ebd5f8 100644
--- a/utils/bazel/llvm-project-overlay/llvm/driver.bzl
+++ b/utils/bazel/llvm-project-overlay/llvm/driver.bzl
@@ -39,6 +39,7 @@ _EXTRA_ALIASES = {
"clang": ["clang++", "clang-cl", "clang-cpp"],
"lld": ["ld", "lld-link", "ld.lld", "ld64.lld", "wasm-ld"],
"llvm-ar": ["ranlib", "lib", "dlltool"],
+ "llvm-cxxfilt": ["c++filt"],
"llvm-objcopy": ["bitcode-strip", "install-name-tool", "strip"],
"llvm-objdump": ["otool"],
"llvm-rc": ["windres"],
diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
index e9385f45c5e5..a4fb47d677ab 100644
--- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
+++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h
@@ -222,9 +222,6 @@
/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1
-/* Define if the setupterm() function is supported this platform. */
-/* LLVM_ENABLE_TERMINFO defined in Bazel */
-
/* Define to 1 if you have the <termios.h> header file. */
#define HAVE_TERMIOS_H 1
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index fc449e9010ae..a7bbe459fd9d 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -51,10 +51,7 @@ expand_template(
"#cmakedefine01 MLIR_ENABLE_NVPTXCOMPILER": "#define MLIR_ENABLE_NVPTXCOMPILER 0",
"#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH": "#define MLIR_ENABLE_PDL_IN_PATTERNMATCH 1",
"#cmakedefine01 MLIR_ENABLE_ROCM_CONVERSIONS": "#define MLIR_ENABLE_ROCM_CONVERSIONS 0",
- } | if_cuda_available(
- {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 1"},
- {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 0"},
- ),
+ },
template = "include/mlir/Config/mlir-config.h.cmake",
)
@@ -5616,7 +5613,6 @@ cc_library(
":Transforms",
":VectorToLLVM",
":VectorToSCF",
- ":config",
],
)
@@ -6282,7 +6278,6 @@ cc_library(
":NVVMToLLVMIRTranslation",
":TargetLLVM",
":ToLLVMIRTranslation",
- ":config",
"//llvm:NVPTXCodeGen",
"//llvm:Support",
"//llvm:config",
@@ -7597,7 +7592,6 @@ cc_library(
"include/mlir/Transforms/LoopInvariantCodeMotionUtils.h",
"include/mlir/Transforms/OneToNTypeConversion.h",
"include/mlir/Transforms/RegionUtils.h",
- "include/mlir/Transforms/TopologicalSortUtils.h",
],
includes = ["include"],
deps = [
@@ -8367,6 +8361,7 @@ cc_library(
":ArithDialect",
":ConversionPassIncGen",
":EmitCDialect",
+ ":PDLLAST",
":Pass",
":TransformUtils",
],
@@ -8723,6 +8718,7 @@ cc_library(
],
includes = ["include"],
deps = [
+ ":Analysis",
":DLTIDialect",
":IR",
":LLVMConversionIncGen",
@@ -8957,6 +8953,7 @@ cc_library(
hdrs = glob(["include/mlir/Target/LLVMIR/Dialect/OpenACC/*.h"]),
includes = ["include"],
deps = [
+ ":Analysis",
":IR",
":LLVMDialect",
":OpenACCDialect",
@@ -8976,6 +8973,7 @@ cc_library(
hdrs = glob(["include/mlir/Target/LLVMIR/Dialect/OpenMP/*.h"]),
includes = ["include"],
deps = [
+ ":Analysis",
":IR",
":LLVMDialect",
":OpenMPCommon",
@@ -9360,7 +9358,6 @@ cc_library(
":X86VectorTransforms",
":XeGPUDialect",
":XeGPUTransforms",
- ":config",
],
)
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 258cc88ebbf3..fdf89d00cbb1 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -36,7 +36,7 @@ expand_template(
"\"@MLIR_BINARY_DIR@\"": "os.environ[\"TEST_UNDECLARED_OUTPUTS_DIR\"]",
# All disabled, but required to substituted because they are not in quotes.
"@LLVM_BUILD_EXAMPLES@": "0",
- "@MLIR_ENABLE_CUDA_CONVERSIONS@": "0",
+ "@LLVM_HAS_NVPTX_TARGET@": "0",
"@MLIR_ENABLE_CUDA_RUNNER@": "0",
"@MLIR_ENABLE_ROCM_CONVERSIONS@": "0",
"@MLIR_ENABLE_ROCM_RUNNER@": "0",
@@ -608,6 +608,7 @@ cc_library(
":TestDialect",
"//mlir:FuncDialect",
"//mlir:FuncToLLVM",
+ "//mlir:IR",
"//mlir:LLVMCommonConversion",
"//mlir:LLVMDialect",
"//mlir:Pass",
@@ -951,10 +952,10 @@ cc_library(
"//mlir:ArmSMEToSCF",
"//mlir:ArmSMETransforms",
"//mlir:ArmSVETransforms",
- "//mlir:FuncDialect",
+ "//mlir:FuncDialect",
"//mlir:IR",
"//mlir:Pass",
- "//mlir:SCFToControlFlow",
+ "//mlir:SCFToControlFlow",
"//mlir:Transforms",
"//mlir:VectorToArmSME",
"//mlir:VectorToSCF",
diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake
index 977c182e9d2b..ff30741c8f36 100644
--- a/utils/bazel/llvm_configs/config.h.cmake
+++ b/utils/bazel/llvm_configs/config.h.cmake
@@ -209,9 +209,6 @@
/* Define to 1 if you have the <sys/types.h> header file. */
#cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H}
-/* Define if the setupterm() function is supported this platform. */
-#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO}
-
/* Define to 1 if you have the <termios.h> header file. */
#cmakedefine HAVE_TERMIOS_H ${HAVE_TERMIOS_H}